
coreutils_rs/tr/core.rs

use std::io::{self, Read, Write};

use rayon::prelude::*;

/// Maximum IoSlice entries per write_vectored batch.
/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
const MAX_IOV: usize = 1024;

/// Stream buffer: 8MB — matches enlarged pipe buffer size (F_SETPIPE_SZ=8MB).
/// For 10MB input: 2 iterations (8MB + 2MB) vs 3 iterations (4MB×2 + 2MB),
/// saving one read() + one write() syscall pair.
const STREAM_BUF: usize = 8 * 1024 * 1024;

/// Minimum data size to engage rayon parallel processing for mmap/batch paths.
/// For 10MB benchmark files, parallel tr translate was a 105% REGRESSION
/// (rayon dispatch overhead exceeds the multi-core benefit for 10MB).
/// Set to 64MB so only genuinely large files benefit from parallelism.
/// Single-threaded AVX2 at 16 GB/s processes 10MB in 0.6ms which is fast enough.
const PARALLEL_THRESHOLD: usize = 64 * 1024 * 1024;

/// 256-entry lookup table for byte compaction: for each 8-bit keep mask,
/// stores the bit positions of set bits (indices of bytes to keep).
/// Used by compact_8bytes to replace the serial trailing_zeros loop with
/// unconditional indexed stores, eliminating the tzcnt→blsr dependency chain.
/// Total size: 256 * 8 = 2KB — fits entirely in L1 cache.
#[cfg(target_arch = "x86_64")]
static COMPACT_LUT: [[u8; 8]; 256] = {
    let mut lut = [[0u8; 8]; 256];
    let mut mask: u16 = 0;
    while mask < 256 {
        let mut idx: usize = 0;
        let mut bit: u8 = 0;
        while bit < 8 {
            if (mask >> bit) & 1 != 0 {
                lut[mask as usize][idx] = bit;
                idx += 1;
            }
            bit += 1;
        }
        mask += 1;
    }
    lut
};
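
// Illustrative sketch (not part of the original file): a scalar model of how a
// compaction step can consume COMPACT_LUT. The real compact_8bytes lives
// elsewhere in this module; the test below only demonstrates the table's
// contract and uses no other assumed APIs.
#[cfg(all(test, target_arch = "x86_64"))]
mod compact_lut_demo {
    use super::COMPACT_LUT;

    #[test]
    fn lut_drives_branchless_compaction() {
        let src = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'];
        let keep_mask: u8 = 0b1010_0110; // keep bytes at positions 1, 2, 5, 7

        // Each kept byte is stored via an index read from the LUT, with no
        // per-bit trailing_zeros/clear-lowest-bit loop.
        let count = keep_mask.count_ones() as usize;
        let mut out = [0u8; 8];
        for j in 0..count {
            out[j] = src[COMPACT_LUT[keep_mask as usize][j] as usize];
        }
        assert_eq!(&out[..count], b"bcfh");
    }
}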

/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
/// Falls back to write_all per slice for partial writes.
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                // Partial write: fall back to write_all per remaining slice
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}
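
// Usage sketch (illustrative, not part of the original file): gathering several
// output segments into a single vectored write. Any Write sink works; a Vec<u8>
// keeps the example self-contained.
#[cfg(test)]
mod write_ioslices_demo {
    use super::write_ioslices;
    use std::io::IoSlice;

    #[test]
    fn writes_all_slices_in_order() {
        let slices = [
            IoSlice::new(b"hello"),
            IoSlice::new(b" "),
            IoSlice::new(b"world\n"),
        ];
        let mut sink: Vec<u8> = Vec::new();
        write_ioslices(&mut sink, &slices).unwrap();
        assert_eq!(sink, b"hello world\n");
    }
}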

/// Allocate a Vec<u8> of given length without zero-initialization.
/// Uses MADV_HUGEPAGE on Linux for buffers >= 2MB to reduce TLB misses.
/// SAFETY: Caller must write all bytes before reading them.
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
    unsafe {
        v.set_len(len);
    }
    #[cfg(target_os = "linux")]
    if len >= 2 * 1024 * 1024 {
        unsafe {
            libc::madvise(
                v.as_mut_ptr() as *mut libc::c_void,
                len,
                libc::MADV_HUGEPAGE,
            );
        }
    }
    v
}

/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
/// When set1 is longer than set2, the surplus bytes of set1 map to set2's last
/// byte; bytes not in set1 map to themselves (identity).
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}
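
// Example (illustrative, not part of the original file): the table is identity
// except for bytes in set1, and a too-short set2 is padded with its last byte,
// matching GNU tr's behavior of extending SET2 with its final character.
#[cfg(test)]
mod build_translate_table_demo {
    use super::build_translate_table;

    #[test]
    fn pads_with_last_byte_of_set2() {
        let table = build_translate_table(b"abc", b"xy");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'y');
        assert_eq!(table[b'c' as usize], b'y'); // padded with last byte of set2
        assert_eq!(table[b'd' as usize], b'd'); // untouched bytes stay identity
    }
}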

/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}
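
// Example (illustrative, not part of the original file): one bit per byte value
// gives constant-time membership tests over an arbitrary byte set.
#[cfg(test)]
mod member_set_demo {
    use super::{build_member_set, is_member};

    #[test]
    fn bitset_membership() {
        let vowels = build_member_set(b"aeiou");
        assert!(is_member(&vowels, b'e'));
        assert!(!is_member(&vowels, b'z'));
        assert!(!is_member(&vowels, 0xFF));
    }
}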

/// Cached SIMD capability level for x86_64.
/// 0 = unchecked, 1 = scalar only, 2 = SSSE3, 3 = AVX2
#[cfg(target_arch = "x86_64")]
static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);

#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn get_simd_level() -> u8 {
    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
    if level != 0 {
        return level;
    }
    let detected = if is_x86_feature_detected!("avx2") {
        3
    } else if is_x86_feature_detected!("ssse3") {
        2
    } else {
        1
    };
    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
    detected
}

/// Count how many entries in the translate table are non-identity.
#[cfg(target_arch = "x86_64")]
#[inline]
fn count_non_identity(table: &[u8; 256]) -> usize {
    table
        .iter()
        .enumerate()
        .filter(|&(i, &v)| v != i as u8)
        .count()
}

/// Translate bytes in-place using a 256-byte lookup table.
/// For sparse translations (few bytes change), uses SIMD skip-ahead:
/// compare 32 bytes at a time against identity, skip unchanged chunks.
/// For dense translations, uses full SIMD nibble decomposition.
/// On non-x86_64 platforms, falls back to translate_inplace_scalar, which is a
/// NEON nibble lookup on aarch64 and an 8x-unrolled scalar loop elsewhere.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            // For sparse translations (<=16 non-identity entries), the skip-ahead
            // approach is faster: load 32 bytes, do a full nibble lookup, compare
            // against input, skip the store if identical. This avoids writing to pages
            // that don't change (important for MAP_PRIVATE COW mmap).
            let non_id = count_non_identity(table);
            if non_id > 0 && non_id <= 16 {
                unsafe { translate_inplace_avx2_sparse(data, table) };
                return;
            }
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if level >= 2 {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}
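
// Correctness sketch (illustrative, not part of the original file): whichever
// SIMD path translate_inplace dispatches to, the result must match a plain
// per-byte table lookup. The buffer is larger than 64 bytes so the unrolled
// loops, the 32/16-byte remainders, and the scalar tail are all exercised.
#[cfg(test)]
mod translate_inplace_demo {
    use super::{build_translate_table, translate_inplace};

    #[test]
    fn matches_per_byte_lookup() {
        let table = build_translate_table(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        );
        let mut data: Vec<u8> = (0u16..=255).cycle().take(1000).map(|b| b as u8).collect();
        let expected: Vec<u8> = data.iter().map(|&b| table[b as usize]).collect();
        translate_inplace(&mut data, &table);
        assert_eq!(data, expected);
    }
}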

/// Sparse AVX2 translate: skip unchanged 32-byte chunks.
/// For each chunk: perform full nibble lookup, compare result vs input.
/// If identical (no bytes changed), skip the store entirely.
/// This reduces memory bandwidth and avoids COW page faults for
/// MAP_PRIVATE mmaps when most bytes are unchanged.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Pre-build 16 lookup vectors (same as full nibble decomposition)
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            // Only store if result differs from input (skip unchanged chunks)
            let diff = _mm256_xor_si256(input, result);
            if _mm256_testz_si256(diff, diff) == 0 {
                _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            }
            i += 32;
        }

        // Scalar tail
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

/// Scalar fallback: 8x-unrolled table lookup.
#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let mut i = 0;
    unsafe {
        while i + 8 <= len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

/// ARM64 NEON table lookup using nibble decomposition (same algorithm as x86 pshufb).
/// Uses vqtbl1q_u8 for 16-byte table lookups, processes 16 bytes per iteration.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    unsafe { translate_inplace_neon_table(data, table) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Pre-build 16 NEON lookup vectors (one per high nibble)
        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
        }

        let lo_mask = vdupq_n_u8(0x0F);
        let mut i = 0;

        while i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let lo_nibble = vandq_u8(input, lo_mask);
            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);

            let mut result = vdupq_n_u8(0);
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = vdupq_n_u8($h);
                    let mask = vceqq_u8(hi_nibble, h_val);
                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            vst1q_u8(ptr.add(i), result);
            i += 16;
        }

        // Scalar tail
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
// ============================================================================
//
// For an arbitrary 256-byte lookup table, we decompose each byte into
// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
// SIMD vectors, one for each high nibble value h (0..15), containing
// the 16 table entries table[h*16+0..h*16+15]. Then for each input
// vector we:
//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
//   2. Extract high nibble (shift right 4) -> used to select which table
//   3. For each of the 16 high nibble values, create a mask where
//      the high nibble equals that value, pshufb the corresponding
//      table, and accumulate results
//
// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
// With instruction-level parallelism, this achieves much higher throughput
// than scalar table lookups which have serial data dependencies.

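// Scalar model of the nibble decomposition above (illustrative, not part of the
// original file): splitting a byte b into hi = b >> 4 and lo = b & 0x0F and
// indexing row hi at column lo reproduces table[b] exactly, which is the
// invariant the pshufb/vqtbl1q kernels rely on.
#[cfg(test)]
mod nibble_decomposition_demo {
    #[test]
    fn row_column_lookup_equals_direct_lookup() {
        // An arbitrary non-trivial table
        let table: [u8; 256] = std::array::from_fn(|i| (i as u8).wrapping_mul(31).wrapping_add(7));
        // rows[h] mirrors one pre-built SIMD lookup vector: table[h*16 .. h*16+16]
        let rows: [[u8; 16]; 16] =
            std::array::from_fn(|h| std::array::from_fn(|l| table[h * 16 + l]));
        for b in 0usize..256 {
            assert_eq!(rows[b >> 4][b & 0x0F], table[b]);
        }
    }
}
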
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Pre-build 16 lookup vectors, one per high nibble value.
        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
        // 16 table entries for pshufb indexing by low nibble.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            // Broadcast the 128-bit row to both lanes of the 256-bit vector
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;

        // 2x unrolled: process 64 bytes (2x32) per iteration for better ILP.
        // The CPU can overlap load/compute of the second vector while the first
        // is in the nibble decomposition pipeline.
        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // Remaining 32-byte chunk
        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // SSE/SSSE3 tail for remaining 16-byte chunk
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Pre-build 16 lookup vectors for pshufb
        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

/// Translate bytes from source to destination using a 256-byte lookup table.
/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            // Use nontemporal stores when dst is 32-byte aligned (large Vec allocations)
            if dst.as_ptr() as usize & 31 == 0 {
                unsafe { translate_to_avx2_table_nt(src, dst, table) };
            } else {
                unsafe { translate_to_avx2_table(src, dst, table) };
            }
            return;
        }
        if level >= 2 {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}
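
// Correctness sketch (illustrative, not part of the original file): translate_to
// must produce the same bytes as a per-byte lookup regardless of which SIMD or
// nontemporal-store path is selected, and regardless of dst alignment.
#[cfg(test)]
mod translate_to_demo {
    use super::{build_translate_table, translate_to};

    #[test]
    fn matches_per_byte_lookup() {
        let table = build_translate_table(b"0123456789", b"##########");
        let src: Vec<u8> = (0u16..=255).cycle().take(777).map(|b| b as u8).collect();
        let mut dst = vec![0u8; src.len()];
        translate_to(&src, &mut dst, &table);
        let expected: Vec<u8> = src.iter().map(|&b| table[b as usize]).collect();
        assert_eq!(dst, expected);
    }
}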

/// Scalar fallback for translate_to.
#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

/// ARM64 NEON table-lookup translate_to using nibble decomposition.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe { translate_to_neon_table(src, dst, table) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
        for h in 0u8..16 {
            lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
        }

        let lo_mask = vdupq_n_u8(0x0F);
        let mut i = 0;

        while i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let lo_nibble = vandq_u8(input, lo_mask);
            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);

            let mut result = vdupq_n_u8(0);
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = vdupq_n_u8($h);
                    let mask = vceqq_u8(hi_nibble, h_val);
                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            vst1q_u8(dp.add(i), result);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // Pre-build 16 lookup vectors
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;

        // 2x unrolled: process 64 bytes per iteration for better ILP
        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // Remaining 32-byte chunk
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // SSSE3 tail for remaining 16-byte chunk
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

/// Nontemporal variant of translate_to_avx2_table: uses _mm256_stream_si256 for stores.
/// Avoids RFO cache traffic for the destination buffer in streaming translate operations.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // Pre-build 16 lookup vectors
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);
        let mut i = 0;

        // 2x unrolled with nontemporal stores
        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_stream_si256(dp.add(i) as *mut _, r0);
            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // Remaining 32-byte chunk
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_stream_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // SSSE3 tail for remaining 16-byte chunk (regular store)
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }

        // Fence: ensure nontemporal stores are visible before write() syscall
        _mm_sfence();
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// SIMD range translation (x86_64)
// ============================================================================

/// Detect if the translate table is a single contiguous range with constant offset.
/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
/// table[i] = i + offset for all i in [lo, hi].
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}
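
// Example (illustrative, not part of the original file): a lowercase -> uppercase
// table is one contiguous range translated by a constant offset, which is the
// shape the SIMD range path is specialized for; anything non-contiguous is rejected.
#[cfg(test)]
mod detect_range_offset_demo {
    use super::{build_translate_table, detect_range_offset};

    #[test]
    fn detects_lowercase_to_uppercase() {
        let table = build_translate_table(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        );
        assert_eq!(detect_range_offset(&table), Some((b'a', b'z', -32)));

        // Same offset but a non-contiguous range is rejected.
        let gapped = build_translate_table(b"az", b"AZ");
        assert_eq!(detect_range_offset(&gapped), None);
    }
}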

/// Detect if the translate table maps a contiguous range [lo..=hi] to a single constant byte,
/// and all other bytes are identity. This covers cases like `tr '\000-\037' 'X'` where
/// a range maps to one replacement character.
/// Returns Some((lo, hi, replacement)) if the pattern matches.
#[inline]
fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut replacement = 0u8;

    for i in 0..256 {
        if table[i] != i as u8 {
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    replacement = table[i];
                }
                Some(_) => {
                    if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, replacement))
}
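
// Example (illustrative, not part of the original file): a spec like `tr '0-9' '#'`
// builds a table that maps the whole digit range to one byte (set2 is padded with
// its last character), which detect_range_to_constant recognizes so the cheap
// range-check + blend kernel can be used.
#[cfg(test)]
mod detect_range_to_constant_demo {
    use super::{build_translate_table, detect_range_to_constant};

    #[test]
    fn detects_digits_to_hash() {
        let table = build_translate_table(b"0123456789", b"#");
        assert_eq!(detect_range_to_constant(&table), Some((b'0', b'9', b'#')));
    }
}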

/// SIMD-accelerated range-to-constant translation.
/// For tables where a contiguous range [lo..=hi] maps to a single byte, and all
/// other bytes are identity. Uses vectorized range check + blend (5 SIMD ops per
/// 32 bytes with AVX2, vs 48 for general nibble decomposition).
#[cfg(target_arch = "x86_64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
    } else {
        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
    }
}
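
// Equivalence sketch (illustrative, not part of the original file): the vectorized
// range-to-constant kernel must agree with the obvious scalar loop for every byte
// value, including the scalar tail. Uses the `tr '\000-\037' 'X'` shape mentioned
// in the doc comment for detect_range_to_constant.
#[cfg(test)]
mod range_to_constant_inplace_demo {
    use super::translate_range_to_constant_simd_inplace;

    #[test]
    fn matches_scalar_range_replace() {
        let (lo, hi, repl) = (b'\x00', b'\x1f', b'X');
        let mut data: Vec<u8> = (0u16..=255).cycle().take(333).map(|b| b as u8).collect();
        let expected: Vec<u8> = data
            .iter()
            .map(|&b| if b >= lo && b <= hi { repl } else { b })
            .collect();
        translate_range_to_constant_simd_inplace(&mut data, lo, hi, repl);
        assert_eq!(data, expected);
    }
}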
1186
1187#[cfg(target_arch = "x86_64")]
1188#[target_feature(enable = "avx2")]
1189unsafe fn translate_range_to_constant_avx2_inplace(
1190    data: &mut [u8],
1191    lo: u8,
1192    hi: u8,
1193    replacement: u8,
1194) {
1195    use std::arch::x86_64::*;
1196
1197    unsafe {
1198        let range = hi - lo;
1199        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1200        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1201        let repl_v = _mm256_set1_epi8(replacement as i8);
1202        let zero = _mm256_setzero_si256();
1203
1204        let len = data.len();
1205        let ptr = data.as_mut_ptr();
1206        let mut i = 0;
1207
1208        // 2x unrolled: process 64 bytes per iteration for better ILP
1209        while i + 64 <= len {
1210            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1211            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1212            let bi0 = _mm256_add_epi8(in0, bias_v);
1213            let bi1 = _mm256_add_epi8(in1, bias_v);
1214            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1215            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1216            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1217            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1218            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1219            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1220            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1221            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1222            i += 64;
1223        }
1224
1225        // Remaining 32-byte chunk
1226        if i + 32 <= len {
1227            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1228            let biased = _mm256_add_epi8(input, bias_v);
1229            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1230            let in_range = _mm256_cmpeq_epi8(gt, zero);
1231            let result = _mm256_blendv_epi8(input, repl_v, in_range);
1232            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1233            i += 32;
1234        }
1235
1236        if i + 16 <= len {
1237            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1238            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1239            let repl_v128 = _mm_set1_epi8(replacement as i8);
1240            let zero128 = _mm_setzero_si128();
1241
1242            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1243            let biased = _mm_add_epi8(input, bias_v128);
1244            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1245            let in_range = _mm_cmpeq_epi8(gt, zero128);
1246            let result = _mm_blendv_epi8(input, repl_v128, in_range);
1247            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1248            i += 16;
1249        }
1250
1251        while i < len {
1252            let b = *ptr.add(i);
1253            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1254            i += 1;
1255        }
1256    }
1257}
1258
1259#[cfg(target_arch = "x86_64")]
1260#[target_feature(enable = "sse2")]
1261unsafe fn translate_range_to_constant_sse2_inplace(
1262    data: &mut [u8],
1263    lo: u8,
1264    hi: u8,
1265    replacement: u8,
1266) {
1267    use std::arch::x86_64::*;
1268
1269    unsafe {
1270        let range = hi - lo;
1271        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1272        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1273        let repl_v = _mm_set1_epi8(replacement as i8);
1274        let zero = _mm_setzero_si128();
1275
1276        let len = data.len();
1277        let ptr = data.as_mut_ptr();
1278        let mut i = 0;
1279
1280        while i + 16 <= len {
1281            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1282            let biased = _mm_add_epi8(input, bias_v);
1283            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1284            // in_range mask: 0xFF where in range, 0x00 where not
1285            let in_range = _mm_cmpeq_epi8(gt, zero);
1286            // SSE2 blendv: (repl & mask) | (input & ~mask)
1287            let result = _mm_or_si128(
1288                _mm_and_si128(in_range, repl_v),
1289                _mm_andnot_si128(in_range, input),
1290            );
1291            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1292            i += 16;
1293        }
1294
1295        while i < len {
1296            let b = *ptr.add(i);
1297            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1298            i += 1;
1299        }
1300    }
1301}
1302
1303#[cfg(target_arch = "aarch64")]
1304fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1305    unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
1306}
1307
1308#[cfg(target_arch = "aarch64")]
1309#[target_feature(enable = "neon")]
1310unsafe fn translate_range_to_constant_neon_inplace(
1311    data: &mut [u8],
1312    lo: u8,
1313    hi: u8,
1314    replacement: u8,
1315) {
1316    use std::arch::aarch64::*;
1317
1318    unsafe {
1319        let len = data.len();
1320        let ptr = data.as_mut_ptr();
1321        let lo_v = vdupq_n_u8(lo);
1322        let hi_v = vdupq_n_u8(hi);
1323        let repl_v = vdupq_n_u8(replacement);
1324        let mut i = 0;
1325
1326        while i + 32 <= len {
1327            let in0 = vld1q_u8(ptr.add(i));
1328            let in1 = vld1q_u8(ptr.add(i + 16));
1329            let ge0 = vcgeq_u8(in0, lo_v);
1330            let le0 = vcleq_u8(in0, hi_v);
1331            let mask0 = vandq_u8(ge0, le0);
1332            let ge1 = vcgeq_u8(in1, lo_v);
1333            let le1 = vcleq_u8(in1, hi_v);
1334            let mask1 = vandq_u8(ge1, le1);
1335            // bsl: select repl where mask, keep input where not
1336            vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
1337            vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1338            i += 32;
1339        }
1340
1341        if i + 16 <= len {
1342            let input = vld1q_u8(ptr.add(i));
1343            let ge = vcgeq_u8(input, lo_v);
1344            let le = vcleq_u8(input, hi_v);
1345            let mask = vandq_u8(ge, le);
1346            vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
1347            i += 16;
1348        }
1349
1350        while i < len {
1351            let b = *ptr.add(i);
1352            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1353            i += 1;
1354        }
1355    }
1356}
1357
1358#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1359fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1360    for b in data.iter_mut() {
1361        if *b >= lo && *b <= hi {
1362            *b = replacement;
1363        }
1364    }
1365}
1366
1367/// SIMD range-to-constant translation from src to dst (no intermediate copy needed).
1368/// Reads from src, writes translated result to dst in a single pass.
1369#[cfg(target_arch = "x86_64")]
1370fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1371    if get_simd_level() >= 3 {
1372        unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
1373    } else {
1374        unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
1375    }
1376}
1377
1378#[cfg(target_arch = "aarch64")]
1379fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1380    unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
1381}
1382
1383#[cfg(target_arch = "aarch64")]
1384#[target_feature(enable = "neon")]
1385unsafe fn translate_range_to_constant_neon(
1386    src: &[u8],
1387    dst: &mut [u8],
1388    lo: u8,
1389    hi: u8,
1390    replacement: u8,
1391) {
1392    use std::arch::aarch64::*;
1393
1394    unsafe {
1395        let len = src.len();
1396        let sp = src.as_ptr();
1397        let dp = dst.as_mut_ptr();
1398        let lo_v = vdupq_n_u8(lo);
1399        let hi_v = vdupq_n_u8(hi);
1400        let repl_v = vdupq_n_u8(replacement);
1401        let mut i = 0;
1402
1403        while i + 32 <= len {
1404            let in0 = vld1q_u8(sp.add(i));
1405            let in1 = vld1q_u8(sp.add(i + 16));
1406            let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
1407            let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
1408            vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
1409            vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1410            i += 32;
1411        }
1412
1413        if i + 16 <= len {
1414            let input = vld1q_u8(sp.add(i));
1415            let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
1416            vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
1417            i += 16;
1418        }
1419
1420        while i < len {
1421            let b = *sp.add(i);
1422            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1423            i += 1;
1424        }
1425    }
1426}
1427
1428#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1429fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1430    for (i, &b) in src.iter().enumerate() {
1431        unsafe {
1432            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi { replacement } else { b };
1433        }
1434    }
1435}
1436
1437#[cfg(target_arch = "x86_64")]
1438#[target_feature(enable = "avx2")]
1439unsafe fn translate_range_to_constant_avx2(
1440    src: &[u8],
1441    dst: &mut [u8],
1442    lo: u8,
1443    hi: u8,
1444    replacement: u8,
1445) {
1446    use std::arch::x86_64::*;
1447    unsafe {
1448        let range = hi - lo;
1449        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1450        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1451        let repl_v = _mm256_set1_epi8(replacement as i8);
1452        let zero = _mm256_setzero_si256();
1453        let len = src.len();
1454        let sp = src.as_ptr();
1455        let dp = dst.as_mut_ptr();
1456        let mut i = 0;
1457        while i + 64 <= len {
1458            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1459            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1460            let bi0 = _mm256_add_epi8(in0, bias_v);
1461            let bi1 = _mm256_add_epi8(in1, bias_v);
1462            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1463            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1464            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1465            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1466            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1467            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1468            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1469            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1470            i += 64;
1471        }
1472        if i + 32 <= len {
1473            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1474            let biased = _mm256_add_epi8(input, bias_v);
1475            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1476            let in_range = _mm256_cmpeq_epi8(gt, zero);
1477            let result = _mm256_blendv_epi8(input, repl_v, in_range);
1478            _mm256_storeu_si256(dp.add(i) as *mut _, result);
1479            i += 32;
1480        }
1481        while i < len {
1482            let b = *sp.add(i);
1483            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1484            i += 1;
1485        }
1486    }
1487}
1488
1489#[cfg(target_arch = "x86_64")]
1490#[target_feature(enable = "sse2")]
1491unsafe fn translate_range_to_constant_sse2(
1492    src: &[u8],
1493    dst: &mut [u8],
1494    lo: u8,
1495    hi: u8,
1496    replacement: u8,
1497) {
1498    use std::arch::x86_64::*;
1499    unsafe {
1500        let range = hi - lo;
1501        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1502        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1503        let repl_v = _mm_set1_epi8(replacement as i8);
1504        let zero = _mm_setzero_si128();
1505        let len = src.len();
1506        let sp = src.as_ptr();
1507        let dp = dst.as_mut_ptr();
1508        let mut i = 0;
1509        while i + 16 <= len {
1510            let input = _mm_loadu_si128(sp.add(i) as *const _);
1511            let biased = _mm_add_epi8(input, bias_v);
1512            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1513            let in_range = _mm_cmpeq_epi8(gt, zero);
1514            let result = _mm_or_si128(
1515                _mm_and_si128(in_range, repl_v),
1516                _mm_andnot_si128(in_range, input),
1517            );
1518            _mm_storeu_si128(dp.add(i) as *mut _, result);
1519            i += 16;
1520        }
1521        while i < len {
1522            let b = *sp.add(i);
1523            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1524            i += 1;
1525        }
1526    }
1527}
1528
1529/// SIMD-accelerated range translation for mmap'd data.
1530/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
1531/// uses AVX2 (64 bytes/iter, 2x unrolled) or SSE2 (16 bytes/iter) vectorized arithmetic.
1532/// When dst is 32-byte aligned (checked at runtime; typical for large allocations), uses
1533/// nontemporal stores to bypass cache, avoiding read-for-ownership overhead and
1534/// reducing memory traffic by ~33% for streaming writes.
1535#[cfg(target_arch = "x86_64")]
1536fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1537    if get_simd_level() >= 3 {
1538        // Use nontemporal stores when dst is 32-byte aligned (typical for large allocs)
1539        if dst.as_ptr() as usize & 31 == 0 {
1540            unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
1541        } else {
1542            unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
1543        }
1544    } else {
1545        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
1546    }
1547}
1548
1549#[cfg(target_arch = "x86_64")]
1550#[target_feature(enable = "avx2")]
1551unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1552    use std::arch::x86_64::*;
1553
1554    unsafe {
1555        let range = hi - lo;
1556        // Bias: shift range so lo maps to -128 (signed min).
1557        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
1558        // For input < lo: biased wraps to large positive (signed), > threshold.
1559        // For input > hi: biased > -128+range, > threshold.
1560        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1561        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1562        let offset_v = _mm256_set1_epi8(offset);
1563        let zero = _mm256_setzero_si256();
1564
1565        let len = src.len();
1566        let sp = src.as_ptr();
1567        let dp = dst.as_mut_ptr();
1568        let mut i = 0;
1569
1570        // 2x unrolled: process 64 bytes per iteration for better ILP.
1571        // Load/compute on the second vector while the first is in-flight.
1572        while i + 64 <= len {
1573            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1574            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1575            let bi0 = _mm256_add_epi8(in0, bias_v);
1576            let bi1 = _mm256_add_epi8(in1, bias_v);
1577            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1578            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1579            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1580            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1581            let om0 = _mm256_and_si256(m0, offset_v);
1582            let om1 = _mm256_and_si256(m1, offset_v);
1583            let r0 = _mm256_add_epi8(in0, om0);
1584            let r1 = _mm256_add_epi8(in1, om1);
1585            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1586            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1587            i += 64;
1588        }
1589
1590        // Remaining 32-byte chunk
1591        if i + 32 <= len {
1592            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1593            let biased = _mm256_add_epi8(input, bias_v);
1594            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1595            let mask = _mm256_cmpeq_epi8(gt, zero);
1596            let offset_masked = _mm256_and_si256(mask, offset_v);
1597            let result = _mm256_add_epi8(input, offset_masked);
1598            _mm256_storeu_si256(dp.add(i) as *mut _, result);
1599            i += 32;
1600        }
1601
1602        // SSE2 tail for 16-byte remainder
1603        if i + 16 <= len {
1604            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1605            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1606            let offset_v128 = _mm_set1_epi8(offset);
1607            let zero128 = _mm_setzero_si128();
1608
1609            let input = _mm_loadu_si128(sp.add(i) as *const _);
1610            let biased = _mm_add_epi8(input, bias_v128);
1611            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1612            let mask = _mm_cmpeq_epi8(gt, zero128);
1613            let offset_masked = _mm_and_si128(mask, offset_v128);
1614            let result = _mm_add_epi8(input, offset_masked);
1615            _mm_storeu_si128(dp.add(i) as *mut _, result);
1616            i += 16;
1617        }
1618
1619        // Scalar tail
1620        while i < len {
1621            let b = *sp.add(i);
1622            *dp.add(i) = if b >= lo && b <= hi {
1623                b.wrapping_add(offset as u8)
1624            } else {
1625                b
1626            };
1627            i += 1;
1628        }
1629    }
1630}
1631
1632/// Nontemporal variant of translate_range_avx2: uses _mm256_stream_si256 for stores.
1633/// This bypasses the cache for writes, avoiding read-for-ownership (RFO) traffic on
1634/// the destination buffer. For streaming translate (src → dst, dst not read again),
1635/// this reduces memory traffic by ~33% (10MB input: 20MB vs 30MB total traffic).
1636/// Requires dst to be 32-byte aligned; the caller checks alignment and falls back to regular stores otherwise.
1637#[cfg(target_arch = "x86_64")]
1638#[target_feature(enable = "avx2")]
1639unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1640    use std::arch::x86_64::*;
1641
1642    unsafe {
1643        let range = hi - lo;
1644        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1645        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1646        let offset_v = _mm256_set1_epi8(offset);
1647        let zero = _mm256_setzero_si256();
1648
1649        let len = src.len();
1650        let sp = src.as_ptr();
1651        let dp = dst.as_mut_ptr();
1652        let mut i = 0;
1653
1654        // 2x unrolled with nontemporal stores
1655        while i + 64 <= len {
1656            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1657            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1658            let bi0 = _mm256_add_epi8(in0, bias_v);
1659            let bi1 = _mm256_add_epi8(in1, bias_v);
1660            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1661            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1662            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1663            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1664            let om0 = _mm256_and_si256(m0, offset_v);
1665            let om1 = _mm256_and_si256(m1, offset_v);
1666            let r0 = _mm256_add_epi8(in0, om0);
1667            let r1 = _mm256_add_epi8(in1, om1);
1668            _mm256_stream_si256(dp.add(i) as *mut _, r0);
1669            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
1670            i += 64;
1671        }
1672
1673        // Remaining 32-byte chunk (still nontemporal if aligned)
1674        if i + 32 <= len {
1675            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1676            let biased = _mm256_add_epi8(input, bias_v);
1677            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1678            let mask = _mm256_cmpeq_epi8(gt, zero);
1679            let offset_masked = _mm256_and_si256(mask, offset_v);
1680            let result = _mm256_add_epi8(input, offset_masked);
1681            _mm256_stream_si256(dp.add(i) as *mut _, result);
1682            i += 32;
1683        }
1684
1685        // SSE2 tail for 16-byte remainder (regular store — only 16 bytes)
1686        if i + 16 <= len {
1687            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1688            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1689            let offset_v128 = _mm_set1_epi8(offset);
1690            let zero128 = _mm_setzero_si128();
1691
1692            let input = _mm_loadu_si128(sp.add(i) as *const _);
1693            let biased = _mm_add_epi8(input, bias_v128);
1694            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1695            let mask = _mm_cmpeq_epi8(gt, zero128);
1696            let offset_masked = _mm_and_si128(mask, offset_v128);
1697            let result = _mm_add_epi8(input, offset_masked);
1698            _mm_storeu_si128(dp.add(i) as *mut _, result);
1699            i += 16;
1700        }
1701
1702        // Scalar tail
1703        while i < len {
1704            let b = *sp.add(i);
1705            *dp.add(i) = if b >= lo && b <= hi {
1706                b.wrapping_add(offset as u8)
1707            } else {
1708                b
1709            };
1710            i += 1;
1711        }
1712
1713        // Fence: ensure nontemporal stores are visible before write() syscall
1714        _mm_sfence();
1715    }
1716}
1717
1718#[cfg(target_arch = "x86_64")]
1719#[target_feature(enable = "sse2")]
1720unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1721    use std::arch::x86_64::*;
1722
1723    unsafe {
1724        let range = hi - lo;
1725        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1726        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1727        let offset_v = _mm_set1_epi8(offset);
1728        let zero = _mm_setzero_si128();
1729
1730        let len = src.len();
1731        let mut i = 0;
1732
1733        while i + 16 <= len {
1734            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
1735            let biased = _mm_add_epi8(input, bias_v);
1736            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1737            let mask = _mm_cmpeq_epi8(gt, zero);
1738            let offset_masked = _mm_and_si128(mask, offset_v);
1739            let result = _mm_add_epi8(input, offset_masked);
1740            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
1741            i += 16;
1742        }
1743
1744        while i < len {
1745            let b = *src.get_unchecked(i);
1746            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
1747                b.wrapping_add(offset as u8)
1748            } else {
1749                b
1750            };
1751            i += 1;
1752        }
1753    }
1754}
1755
1756/// ARM64 NEON-accelerated range translation.
1757/// Processes 32 bytes per iteration (2x unrolled) using vectorized range check + conditional add.
1758#[cfg(target_arch = "aarch64")]
1759fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1760    unsafe { translate_range_neon(src, dst, lo, hi, offset) };
1761}
1762
1763#[cfg(target_arch = "aarch64")]
1764#[target_feature(enable = "neon")]
1765unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1766    use std::arch::aarch64::*;
1767
1768    unsafe {
1769        let len = src.len();
1770        let sp = src.as_ptr();
1771        let dp = dst.as_mut_ptr();
1772        let lo_v = vdupq_n_u8(lo);
1773        let hi_v = vdupq_n_u8(hi);
1774        let offset_v = vdupq_n_s8(offset);
1775        let mut i = 0;
1776
1777        // 2x unrolled: process 32 bytes per iteration
1778        while i + 32 <= len {
1779            let in0 = vld1q_u8(sp.add(i));
1780            let in1 = vld1q_u8(sp.add(i + 16));
1781            // Range check: (b >= lo) & (b <= hi)
1782            let ge0 = vcgeq_u8(in0, lo_v);
1783            let le0 = vcleq_u8(in0, hi_v);
1784            let mask0 = vandq_u8(ge0, le0);
1785            let ge1 = vcgeq_u8(in1, lo_v);
1786            let le1 = vcleq_u8(in1, hi_v);
1787            let mask1 = vandq_u8(ge1, le1);
1788            // Conditional add: in + (offset & mask)
1789            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
1790            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
1791            let r0 = vaddq_u8(in0, off0);
1792            let r1 = vaddq_u8(in1, off1);
1793            vst1q_u8(dp.add(i), r0);
1794            vst1q_u8(dp.add(i + 16), r1);
1795            i += 32;
1796        }
1797
1798        if i + 16 <= len {
1799            let input = vld1q_u8(sp.add(i));
1800            let ge = vcgeq_u8(input, lo_v);
1801            let le = vcleq_u8(input, hi_v);
1802            let mask = vandq_u8(ge, le);
1803            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
1804            vst1q_u8(dp.add(i), vaddq_u8(input, off));
1805            i += 16;
1806        }
1807
1808        while i < len {
1809            let b = *sp.add(i);
1810            *dp.add(i) = if b >= lo && b <= hi {
1811                b.wrapping_add(offset as u8)
1812            } else {
1813                b
1814            };
1815            i += 1;
1816        }
1817    }
1818}
1819
1820/// Scalar range translation fallback for non-x86_64, non-aarch64 platforms.
1821#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1822fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1823    let offset_u8 = offset as u8;
1824    let range = hi.wrapping_sub(lo);
1825    unsafe {
1826        let sp = src.as_ptr();
1827        let dp = dst.as_mut_ptr();
1828        let len = src.len();
1829        let mut i = 0;
1830        while i + 8 <= len {
1831            macro_rules! do_byte {
1832                ($off:expr) => {{
1833                    let b = *sp.add(i + $off);
1834                    let in_range = b.wrapping_sub(lo) <= range;
1835                    *dp.add(i + $off) = if in_range {
1836                        b.wrapping_add(offset_u8)
1837                    } else {
1838                        b
1839                    };
1840                }};
1841            }
1842            do_byte!(0);
1843            do_byte!(1);
1844            do_byte!(2);
1845            do_byte!(3);
1846            do_byte!(4);
1847            do_byte!(5);
1848            do_byte!(6);
1849            do_byte!(7);
1850            i += 8;
1851        }
1852        while i < len {
1853            let b = *sp.add(i);
1854            let in_range = b.wrapping_sub(lo) <= range;
1855            *dp.add(i) = if in_range {
1856                b.wrapping_add(offset_u8)
1857            } else {
1858                b
1859            };
1860            i += 1;
1861        }
1862    }
1863}
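
// Illustrative test sketch: whichever per-arch translate_range_simd variant is compiled
// in must agree with a plain scalar reference, covering the unrolled body and the tail.
#[cfg(test)]
#[test]
fn translate_range_simd_matches_scalar_reference() {
    let src: Vec<u8> = (0..=255u8).cycle().take(1000).collect();
    let mut dst = vec![0u8; src.len()];
    // Map 'a'..='z' to 'A'..='Z' (offset -32).
    translate_range_simd(&src, &mut dst, b'a', b'z', -32);
    for (&s, &d) in src.iter().zip(dst.iter()) {
        let expected = if (b'a'..=b'z').contains(&s) { s - 32 } else { s };
        assert_eq!(d, expected);
    }
}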
1864
1865// ============================================================================
1866// In-place SIMD range translation (saves one buffer allocation in streaming)
1867// ============================================================================
1868
1869/// In-place SIMD-accelerated range translation.
1870/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
1871/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
1872#[cfg(target_arch = "x86_64")]
1873fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1874    if get_simd_level() >= 3 {
1875        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
1876    } else {
1877        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
1878    }
1879}
1880
1881#[cfg(target_arch = "x86_64")]
1882#[target_feature(enable = "avx2")]
1883unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1884    use std::arch::x86_64::*;
1885
1886    unsafe {
1887        let range = hi - lo;
1888        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1889        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1890        let offset_v = _mm256_set1_epi8(offset);
1891        let zero = _mm256_setzero_si256();
1892
1893        let len = data.len();
1894        let ptr = data.as_mut_ptr();
1895        let mut i = 0;
1896
1897        // 2x unrolled: process 64 bytes per iteration for better ILP
1898        while i + 64 <= len {
1899            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1900            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1901            let bi0 = _mm256_add_epi8(in0, bias_v);
1902            let bi1 = _mm256_add_epi8(in1, bias_v);
1903            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1904            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1905            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1906            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1907            let om0 = _mm256_and_si256(m0, offset_v);
1908            let om1 = _mm256_and_si256(m1, offset_v);
1909            let r0 = _mm256_add_epi8(in0, om0);
1910            let r1 = _mm256_add_epi8(in1, om1);
1911            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1912            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1913            i += 64;
1914        }
1915
1916        // Remaining 32-byte chunk
1917        if i + 32 <= len {
1918            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1919            let biased = _mm256_add_epi8(input, bias_v);
1920            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1921            let mask = _mm256_cmpeq_epi8(gt, zero);
1922            let offset_masked = _mm256_and_si256(mask, offset_v);
1923            let result = _mm256_add_epi8(input, offset_masked);
1924            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1925            i += 32;
1926        }
1927
1928        if i + 16 <= len {
1929            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1930            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1931            let offset_v128 = _mm_set1_epi8(offset);
1932            let zero128 = _mm_setzero_si128();
1933
1934            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1935            let biased = _mm_add_epi8(input, bias_v128);
1936            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1937            let mask = _mm_cmpeq_epi8(gt, zero128);
1938            let offset_masked = _mm_and_si128(mask, offset_v128);
1939            let result = _mm_add_epi8(input, offset_masked);
1940            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1941            i += 16;
1942        }
1943
1944        while i < len {
1945            let b = *ptr.add(i);
1946            *ptr.add(i) = if b >= lo && b <= hi {
1947                b.wrapping_add(offset as u8)
1948            } else {
1949                b
1950            };
1951            i += 1;
1952        }
1953    }
1954}
1955
1956#[cfg(target_arch = "x86_64")]
1957#[target_feature(enable = "sse2")]
1958unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1959    use std::arch::x86_64::*;
1960
1961    unsafe {
1962        let range = hi - lo;
1963        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1964        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1965        let offset_v = _mm_set1_epi8(offset);
1966        let zero = _mm_setzero_si128();
1967
1968        let len = data.len();
1969        let ptr = data.as_mut_ptr();
1970        let mut i = 0;
1971
1972        while i + 16 <= len {
1973            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1974            let biased = _mm_add_epi8(input, bias_v);
1975            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1976            let mask = _mm_cmpeq_epi8(gt, zero);
1977            let offset_masked = _mm_and_si128(mask, offset_v);
1978            let result = _mm_add_epi8(input, offset_masked);
1979            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1980            i += 16;
1981        }
1982
1983        while i < len {
1984            let b = *ptr.add(i);
1985            *ptr.add(i) = if b >= lo && b <= hi {
1986                b.wrapping_add(offset as u8)
1987            } else {
1988                b
1989            };
1990            i += 1;
1991        }
1992    }
1993}
1994
1995#[cfg(target_arch = "aarch64")]
1996fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1997    unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
1998}
1999
2000#[cfg(target_arch = "aarch64")]
2001#[target_feature(enable = "neon")]
2002unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2003    use std::arch::aarch64::*;
2004
2005    unsafe {
2006        let len = data.len();
2007        let ptr = data.as_mut_ptr();
2008        let lo_v = vdupq_n_u8(lo);
2009        let hi_v = vdupq_n_u8(hi);
2010        let offset_v = vdupq_n_s8(offset);
2011        let mut i = 0;
2012
2013        while i + 32 <= len {
2014            let in0 = vld1q_u8(ptr.add(i));
2015            let in1 = vld1q_u8(ptr.add(i + 16));
2016            let ge0 = vcgeq_u8(in0, lo_v);
2017            let le0 = vcleq_u8(in0, hi_v);
2018            let mask0 = vandq_u8(ge0, le0);
2019            let ge1 = vcgeq_u8(in1, lo_v);
2020            let le1 = vcleq_u8(in1, hi_v);
2021            let mask1 = vandq_u8(ge1, le1);
2022            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
2023            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
2024            vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
2025            vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
2026            i += 32;
2027        }
2028
2029        if i + 16 <= len {
2030            let input = vld1q_u8(ptr.add(i));
2031            let ge = vcgeq_u8(input, lo_v);
2032            let le = vcleq_u8(input, hi_v);
2033            let mask = vandq_u8(ge, le);
2034            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
2035            vst1q_u8(ptr.add(i), vaddq_u8(input, off));
2036            i += 16;
2037        }
2038
2039        while i < len {
2040            let b = *ptr.add(i);
2041            if b >= lo && b <= hi {
2042                *ptr.add(i) = b.wrapping_add(offset as u8);
2043            }
2044            i += 1;
2045        }
2046    }
2047}
2048
2049#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
2050fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2051    let offset_u8 = offset as u8;
2052    let range = hi.wrapping_sub(lo);
2053    for b in data.iter_mut() {
2054        if b.wrapping_sub(lo) <= range {
2055            *b = b.wrapping_add(offset_u8);
2056        }
2057    }
2058}
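
// Illustrative test sketch: the in-place range translation must match a scalar loop
// over the same data, whichever arch-specific variant is compiled in.
#[cfg(test)]
#[test]
fn translate_range_simd_inplace_matches_scalar() {
    let original: Vec<u8> = (0..=255u8).cycle().take(777).collect();
    let mut buf = original.clone();
    // Map '0'..='9' to 'A'..='J' (offset +17).
    translate_range_simd_inplace(&mut buf, b'0', b'9', 17);
    for (&o, &t) in original.iter().zip(buf.iter()) {
        let expected = if o.is_ascii_digit() { o + 17 } else { o };
        assert_eq!(t, expected);
    }
}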
2059
2060// ============================================================================
2061// SIMD range deletion (x86_64)
2062// ============================================================================
2063
2064/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
2065/// Returns Some((lo, hi)) if so. This is true for common classes:
2066/// - `[:digit:]` = 0x30..=0x39
2067/// - `a-z` = 0x61..=0x7A
2068/// - `A-Z` = 0x41..=0x5A
2069#[inline]
2070fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
2071    if chars.is_empty() {
2072        return None;
2073    }
2074    let mut lo = chars[0];
2075    let mut hi = chars[0];
2076    for &c in &chars[1..] {
2077        if c < lo {
2078            lo = c;
2079        }
2080        if c > hi {
2081            hi = c;
2082        }
2083    }
2084    // Check that the range size matches the number of chars (no gaps)
2085    // Cast to usize before +1 to avoid u8 overflow when hi=255, lo=0 (range=256)
2086    if (hi as usize - lo as usize + 1) == chars.len() {
2087        Some((lo, hi))
2088    } else {
2089        None
2090    }
2091}
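
// Illustrative test sketch: detect_delete_range accepts a contiguous range given in any
// order and rejects sets with gaps or empty input.
#[cfg(test)]
#[test]
fn detect_delete_range_examples() {
    // [:digit:] supplied out of order still collapses to 0x30..=0x39.
    assert_eq!(detect_delete_range(b"9876543210"), Some((b'0', b'9')));
    // 'a' and 'c' leave a gap at 'b', so no contiguous range is reported.
    assert_eq!(detect_delete_range(b"ac"), None);
    assert_eq!(detect_delete_range(b""), None);
}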
2092
2093/// SIMD-accelerated delete for contiguous byte ranges.
2094/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
2095/// then compacts output by skipping matched bytes.
2096#[cfg(target_arch = "x86_64")]
2097fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2098    if get_simd_level() >= 3 {
2099        unsafe { delete_range_avx2(src, dst, lo, hi) }
2100    } else {
2101        unsafe { delete_range_sse2(src, dst, lo, hi) }
2102    }
2103}
2104
2105#[cfg(target_arch = "x86_64")]
2106#[target_feature(enable = "avx2")]
2107unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2108    use std::arch::x86_64::*;
2109
2110    unsafe {
2111        let range = hi - lo;
2112        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2113        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2114        let zero = _mm256_setzero_si256();
2115
2116        let len = src.len();
2117        let sp = src.as_ptr();
2118        let dp = dst.as_mut_ptr();
2119        let mut ri = 0;
2120        let mut wp = 0;
2121
2122        while ri + 32 <= len {
2123            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
2124            let biased = _mm256_add_epi8(input, bias_v);
2125            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
2126            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2127            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
2128            let in_range = _mm256_cmpeq_epi8(gt, zero);
2129            // keep_mask bits: 1 = keep (NOT in range)
2130            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
2131
2132            if keep_mask == 0xFFFFFFFF {
2133                // All 32 bytes are kept — bulk copy
2134                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
2135                wp += 32;
2136            } else if keep_mask != 0 {
2137                // Partial keep — per-lane processing with all-keep fast paths.
2138                // For 4% delete rate, ~72% of 8-byte lanes are all-keep even
2139                // within partial 32-byte blocks. The per-lane check avoids
2140                // the LUT compact overhead for these clean lanes.
2141                let m0 = keep_mask as u8;
2142                let m1 = (keep_mask >> 8) as u8;
2143                let m2 = (keep_mask >> 16) as u8;
2144                let m3 = (keep_mask >> 24) as u8;
2145
2146                if m0 == 0xFF {
2147                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2148                } else if m0 != 0 {
2149                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2150                }
2151                let c0 = m0.count_ones() as usize;
2152
2153                if m1 == 0xFF {
2154                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2155                } else if m1 != 0 {
2156                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2157                }
2158                let c1 = m1.count_ones() as usize;
2159
2160                if m2 == 0xFF {
2161                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
2162                } else if m2 != 0 {
2163                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
2164                }
2165                let c2 = m2.count_ones() as usize;
2166
2167                if m3 == 0xFF {
2168                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
2169                } else if m3 != 0 {
2170                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
2171                }
2172                let c3 = m3.count_ones() as usize;
2173                wp += c0 + c1 + c2 + c3;
2174            }
2175            // else: keep_mask == 0 means all bytes deleted, skip entirely
2176            ri += 32;
2177        }
2178
2179        // SSE2 tail for 16-byte remainder
2180        if ri + 16 <= len {
2181            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2182            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2183            let zero128 = _mm_setzero_si128();
2184
2185            let input = _mm_loadu_si128(sp.add(ri) as *const _);
2186            let biased = _mm_add_epi8(input, bias_v128);
2187            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
2188            let in_range = _mm_cmpeq_epi8(gt, zero128);
2189            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2190
2191            if keep_mask == 0xFFFF {
2192                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2193                wp += 16;
2194            } else if keep_mask != 0 {
2195                let m0 = keep_mask as u8;
2196                let m1 = (keep_mask >> 8) as u8;
2197                if m0 == 0xFF {
2198                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2199                } else if m0 != 0 {
2200                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2201                }
2202                let c0 = m0.count_ones() as usize;
2203                if m1 == 0xFF {
2204                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2205                } else if m1 != 0 {
2206                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2207                }
2208                wp += c0 + m1.count_ones() as usize;
2209            }
2210            ri += 16;
2211        }
2212
2213        // Scalar tail — branchless: always store, advance wp only for kept bytes
2214        while ri < len {
2215            let b = *sp.add(ri);
2216            *dp.add(wp) = b;
2217            wp += (b < lo || b > hi) as usize;
2218            ri += 1;
2219        }
2220
2221        wp
2222    }
2223}
2224
2225/// Compact 8 source bytes into contiguous output bytes using a keep mask.
2226/// Each bit in `mask` indicates whether the corresponding byte should be kept.
2227/// Uses a precomputed LUT: for each 8-bit mask, the LUT stores indices of set bits.
2228/// Always performs 8 unconditional stores (extra stores past popcount are harmless
2229/// since the write pointer only advances by popcount, and subsequent lanes overwrite).
2230/// This eliminates the serial tzcnt→blsr dependency chain (~28 cycles) in favor of
2231/// independent indexed loads and stores (~15 cycles).
2232#[cfg(target_arch = "x86_64")]
2233#[inline(always)]
2234unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
2235    unsafe {
2236        let idx = COMPACT_LUT.get_unchecked(mask as usize);
2237        *dst = *src.add(*idx.get_unchecked(0) as usize);
2238        *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
2239        *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
2240        *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
2241        *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
2242        *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
2243        *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
2244        *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
2245    }
2246}
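
// Illustrative test sketch: compact_8bytes must move exactly the bytes whose mask bit is
// set to the front of dst; slots past the popcount are scratch and ignored here.
#[cfg(all(test, target_arch = "x86_64"))]
#[test]
fn compact_8bytes_keeps_masked_bytes() {
    let src = [10u8, 20, 30, 40, 50, 60, 70, 80];
    let mut dst = [0u8; 8];
    // Keep indices 1, 3, 5, 7.
    unsafe { compact_8bytes(src.as_ptr(), dst.as_mut_ptr(), 0b1010_1010) };
    assert_eq!(&dst[..4], &[20, 40, 60, 80]);
}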
2247
2248/// SSSE3 pshufb-based byte compaction. Loads 8 source bytes into an XMM register,
2249/// shuffles kept bytes to the front using COMPACT_LUT + _mm_shuffle_epi8, stores 8 bytes.
2250/// ~4x faster than scalar compact_8bytes: 1 pshufb vs 8 individual indexed byte copies.
2251/// Requires SSSE3; safe to call from AVX2 functions (which imply SSSE3).
2252#[cfg(target_arch = "x86_64")]
2253#[target_feature(enable = "ssse3")]
2254#[inline]
2255unsafe fn compact_8bytes_simd(src: *const u8, dst: *mut u8, mask: u8) {
2256    use std::arch::x86_64::*;
2257    unsafe {
2258        let src_v = _mm_loadl_epi64(src as *const _);
2259        let shuf = _mm_loadl_epi64(COMPACT_LUT.get_unchecked(mask as usize).as_ptr() as *const _);
2260        let out_v = _mm_shuffle_epi8(src_v, shuf);
2261        _mm_storel_epi64(dst as *mut _, out_v);
2262    }
2263}
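
// Illustrative test sketch: when SSSE3 is available at runtime, the pshufb path must
// produce the same kept-byte prefix as the scalar LUT path for every possible mask.
#[cfg(all(test, target_arch = "x86_64"))]
#[test]
fn compact_8bytes_simd_matches_scalar() {
    if !is_x86_feature_detected!("ssse3") {
        return;
    }
    let src = [1u8, 2, 3, 4, 5, 6, 7, 8];
    for mask in 0u16..256 {
        let mask = mask as u8;
        let keep = mask.count_ones() as usize;
        let mut a = [0u8; 8];
        let mut b = [0u8; 8];
        unsafe {
            compact_8bytes(src.as_ptr(), a.as_mut_ptr(), mask);
            compact_8bytes_simd(src.as_ptr(), b.as_mut_ptr(), mask);
        }
        assert_eq!(&a[..keep], &b[..keep]);
    }
}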
2264
2265#[cfg(target_arch = "x86_64")]
2266#[target_feature(enable = "sse2")]
2267unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2268    use std::arch::x86_64::*;
2269
2270    unsafe {
2271        let range = hi - lo;
2272        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2273        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2274        let zero = _mm_setzero_si128();
2275
2276        let len = src.len();
2277        let sp = src.as_ptr();
2278        let dp = dst.as_mut_ptr();
2279        let mut ri = 0;
2280        let mut wp = 0;
2281
2282        while ri + 16 <= len {
2283            let input = _mm_loadu_si128(sp.add(ri) as *const _);
2284            let biased = _mm_add_epi8(input, bias_v);
2285            let gt = _mm_cmpgt_epi8(biased, threshold_v);
2286            let in_range = _mm_cmpeq_epi8(gt, zero);
2287            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2288
2289            if keep_mask == 0xFFFF {
2290                // All 16 bytes kept — bulk copy
2291                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2292                wp += 16;
2293            } else if keep_mask != 0 {
2294                let m0 = keep_mask as u8;
2295                let m1 = (keep_mask >> 8) as u8;
2296                if m0 == 0xFF {
2297                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2298                } else if m0 != 0 {
2299                    compact_8bytes(sp.add(ri), dp.add(wp), m0);
2300                }
2301                let c0 = m0.count_ones() as usize;
2302                if m1 == 0xFF {
2303                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2304                } else if m1 != 0 {
2305                    compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2306                }
2307                wp += c0 + m1.count_ones() as usize;
2308            }
2309            ri += 16;
2310        }
2311
2312        // Scalar tail — branchless
2313        while ri < len {
2314            let b = *sp.add(ri);
2315            *dp.add(wp) = b;
2316            wp += (b < lo || b > hi) as usize;
2317            ri += 1;
2318        }
2319
2320        wp
2321    }
2322}
2323
2324/// Branchless range delete fallback for non-x86_64 (ARM64, etc.).
2325/// Unconditional store + conditional pointer advance eliminates branch
2326/// mispredictions. Unrolled 8x for better ILP on out-of-order cores.
2327#[cfg(not(target_arch = "x86_64"))]
2328fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2329    let len = src.len();
2330    let sp = src.as_ptr();
2331    let dp = dst.as_mut_ptr();
2332    let mut wp: usize = 0;
2333    let mut i: usize = 0;
2334
2335    // Unrolled branchless loop — 8 bytes per iteration
2336    while i + 8 <= len {
2337        unsafe {
2338            let b0 = *sp.add(i);
2339            *dp.add(wp) = b0;
2340            wp += (b0 < lo || b0 > hi) as usize;
2341            let b1 = *sp.add(i + 1);
2342            *dp.add(wp) = b1;
2343            wp += (b1 < lo || b1 > hi) as usize;
2344            let b2 = *sp.add(i + 2);
2345            *dp.add(wp) = b2;
2346            wp += (b2 < lo || b2 > hi) as usize;
2347            let b3 = *sp.add(i + 3);
2348            *dp.add(wp) = b3;
2349            wp += (b3 < lo || b3 > hi) as usize;
2350            let b4 = *sp.add(i + 4);
2351            *dp.add(wp) = b4;
2352            wp += (b4 < lo || b4 > hi) as usize;
2353            let b5 = *sp.add(i + 5);
2354            *dp.add(wp) = b5;
2355            wp += (b5 < lo || b5 > hi) as usize;
2356            let b6 = *sp.add(i + 6);
2357            *dp.add(wp) = b6;
2358            wp += (b6 < lo || b6 > hi) as usize;
2359            let b7 = *sp.add(i + 7);
2360            *dp.add(wp) = b7;
2361            wp += (b7 < lo || b7 > hi) as usize;
2362        }
2363        i += 8;
2364    }
2365
2366    // Scalar tail
2367    while i < len {
2368        unsafe {
2369            let b = *sp.add(i);
2370            *dp.add(wp) = b;
2371            wp += (b < lo || b > hi) as usize;
2372        }
2373        i += 1;
2374    }
2375
2376    wp
2377}
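
// Illustrative test sketch: deleting the digit range from a mixed buffer must leave the
// non-digit bytes, in order, in the destination (same expectation on every arch path).
#[cfg(test)]
#[test]
fn delete_range_chunk_removes_digits() {
    let src: Vec<u8> = b"a1b22c333d4444e".iter().copied().cycle().take(600).collect();
    let mut dst = vec![0u8; src.len()];
    let kept = delete_range_chunk(&src, &mut dst, b'0', b'9');
    let expected: Vec<u8> = src.iter().copied().filter(|b| !b.is_ascii_digit()).collect();
    assert_eq!(&dst[..kept], &expected[..]);
}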
2378
2379/// Streaming delete for contiguous byte ranges using SIMD range detection.
2380/// Uses the 8MB STREAM_BUF buffer to reduce syscalls (delete is compute-light, I/O bound).
2381/// Deletion is performed in-place on the read buffer, so a chunk with nothing to delete
2382/// (common for data with few matches) is written straight from that buffer.
2383fn delete_range_streaming(
2384    lo: u8,
2385    hi: u8,
2386    reader: &mut impl Read,
2387    writer: &mut impl Write,
2388) -> io::Result<()> {
2389    let mut buf = alloc_uninit_vec(STREAM_BUF);
2390    loop {
2391        let n = read_once(reader, &mut buf)?;
2392        if n == 0 {
2393            break;
2394        }
2395        let wp = delete_range_inplace(&mut buf, n, lo, hi);
2396        if wp > 0 {
2397            writer.write_all(&buf[..wp])?;
2398        }
2399    }
2400    Ok(())
2401}
2402
2403/// In-place range delete: SIMD scan for all-keep blocks + branchless scalar compaction.
2404/// Uses a single buffer — reads at position ri, writes at position wp (wp <= ri always).
2405#[inline]
2406fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2407    #[cfg(target_arch = "x86_64")]
2408    {
2409        let level = get_simd_level();
2410        if level >= 3 {
2411            return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
2412        }
2413    }
2414    // Scalar fallback: branchless in-place delete
2415    let ptr = buf.as_mut_ptr();
2416    let mut ri = 0;
2417    let mut wp = 0;
2418    unsafe {
2419        while ri + 8 <= n {
2420            let b0 = *ptr.add(ri);
2421            let b1 = *ptr.add(ri + 1);
2422            let b2 = *ptr.add(ri + 2);
2423            let b3 = *ptr.add(ri + 3);
2424            let b4 = *ptr.add(ri + 4);
2425            let b5 = *ptr.add(ri + 5);
2426            let b6 = *ptr.add(ri + 6);
2427            let b7 = *ptr.add(ri + 7);
2428            *ptr.add(wp) = b0;
2429            wp += (b0 < lo || b0 > hi) as usize;
2430            *ptr.add(wp) = b1;
2431            wp += (b1 < lo || b1 > hi) as usize;
2432            *ptr.add(wp) = b2;
2433            wp += (b2 < lo || b2 > hi) as usize;
2434            *ptr.add(wp) = b3;
2435            wp += (b3 < lo || b3 > hi) as usize;
2436            *ptr.add(wp) = b4;
2437            wp += (b4 < lo || b4 > hi) as usize;
2438            *ptr.add(wp) = b5;
2439            wp += (b5 < lo || b5 > hi) as usize;
2440            *ptr.add(wp) = b6;
2441            wp += (b6 < lo || b6 > hi) as usize;
2442            *ptr.add(wp) = b7;
2443            wp += (b7 < lo || b7 > hi) as usize;
2444            ri += 8;
2445        }
2446        while ri < n {
2447            let b = *ptr.add(ri);
2448            *ptr.add(wp) = b;
2449            wp += (b < lo || b > hi) as usize;
2450            ri += 1;
2451        }
2452    }
2453    wp
2454}
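
// Illustrative test sketch: the in-place delete compacts the kept bytes to the front of
// the buffer and returns the new length, matching a plain scalar filter.
#[cfg(test)]
#[test]
fn delete_range_inplace_compacts() {
    let mut buf: Vec<u8> = b"x0y1z2 keep 345 me 6789".iter().copied().cycle().take(500).collect();
    let original = buf.clone();
    let n = buf.len();
    let wp = delete_range_inplace(&mut buf, n, b'0', b'9');
    let expected: Vec<u8> = original.iter().copied().filter(|b| !b.is_ascii_digit()).collect();
    assert_eq!(&buf[..wp], &expected[..]);
}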
2455
2456/// AVX2 in-place range delete: scan 32 bytes at a time, skip all-keep blocks,
2457/// pshufb-based LUT compaction for mixed blocks, branchless scalar compaction for the tail.
2458#[cfg(target_arch = "x86_64")]
2459#[target_feature(enable = "avx2")]
2460unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2461    use std::arch::x86_64::*;
2462
2463    unsafe {
2464        let range = hi - lo;
2465        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2466        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2467        let zero = _mm256_setzero_si256();
2468
2469        let ptr = buf.as_mut_ptr();
2470        let mut ri = 0;
2471        let mut wp = 0;
2472
2473        while ri + 32 <= n {
2474            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
2475            let biased = _mm256_add_epi8(input, bias_v);
2476            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2477            let in_range = _mm256_cmpeq_epi8(gt, zero);
2478            let del_mask = _mm256_movemask_epi8(in_range) as u32;
2479
2480            if del_mask == 0 {
2481                // All 32 bytes kept
2482                if wp != ri {
2483                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
2484                }
2485                wp += 32;
2486            } else if del_mask != 0xFFFFFFFF {
2487                // Mixed block: pshufb-based 8-byte compaction.
2488                // Process 4 × 8-byte sub-chunks using COMPACT_LUT + pshufb.
2489                // Each sub-chunk: load 8 bytes into register (safe for overlap),
2490                // shuffle kept bytes to front, store. 4 SIMD ops vs 32 scalar.
2491                let keep_mask = !del_mask;
2492                let m0 = keep_mask as u8;
2493                let m1 = (keep_mask >> 8) as u8;
2494                let m2 = (keep_mask >> 16) as u8;
2495                let m3 = (keep_mask >> 24) as u8;
2496
2497                let c0 = m0.count_ones() as usize;
2498                let c1 = m1.count_ones() as usize;
2499                let c2 = m2.count_ones() as usize;
2500                let c3 = m3.count_ones() as usize;
2501
2502                // Sub-chunk 0: bytes 0-7
2503                if m0 == 0xFF {
2504                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
2505                } else if m0 != 0 {
2506                    let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
2507                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
2508                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2509                    _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
2510                }
2511
2512                // Sub-chunk 1: bytes 8-15
2513                if m1 == 0xFF {
2514                    std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
2515                } else if m1 != 0 {
2516                    let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
2517                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
2518                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2519                    _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
2520                }
2521
2522                // Sub-chunk 2: bytes 16-23
2523                if m2 == 0xFF {
2524                    std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
2525                } else if m2 != 0 {
2526                    let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
2527                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
2528                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2529                    _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
2530                }
2531
2532                // Sub-chunk 3: bytes 24-31
2533                if m3 == 0xFF {
2534                    std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
2535                } else if m3 != 0 {
2536                    let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
2537                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
2538                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2539                    _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
2540                }
2541
2542                wp += c0 + c1 + c2 + c3;
2543            }
2544            // del_mask == 0xFFFFFFFF: all deleted, skip entirely
2545            ri += 32;
2546        }
2547
2548        // Scalar tail
2549        while ri < n {
2550            let b = *ptr.add(ri);
2551            *ptr.add(wp) = b;
2552            wp += (b < lo || b > hi) as usize;
2553            ri += 1;
2554        }
2555
2556        wp
2557    }
2558}
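
// Illustrative test sketch: when AVX2 is available at runtime, the vectorized in-place
// delete must agree with a plain scalar filter across all-keep blocks, all-delete
// blocks, a mixed block, and the scalar tail.
#[cfg(all(test, target_arch = "x86_64"))]
#[test]
fn delete_range_inplace_avx2_matches_filter() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    let mut buf: Vec<u8> = Vec::new();
    buf.extend_from_slice(&[b'A'; 64]); // two all-keep blocks
    buf.extend_from_slice(b"a0b1c2d3e4f5g6h7i8j9k0l1m2n3o4p5"); // one mixed block
    buf.extend_from_slice(&[b'7'; 64]); // two all-delete blocks
    buf.extend_from_slice(b"-tail99"); // scalar tail
    let original = buf.clone();
    let n = buf.len();
    let wp = unsafe { delete_range_inplace_avx2(&mut buf, n, b'0', b'9') };
    let expected: Vec<u8> = original.iter().copied().filter(|b| !b.is_ascii_digit()).collect();
    assert_eq!(&buf[..wp], &expected[..]);
}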
2559
2560// ============================================================================
2561// Streaming functions (Read + Write)
2562// ============================================================================
2563
2564pub fn translate(
2565    set1: &[u8],
2566    set2: &[u8],
2567    reader: &mut impl Read,
2568    writer: &mut impl Write,
2569) -> io::Result<()> {
2570    let table = build_translate_table(set1, set2);
2571
2572    // Check for identity table — pure passthrough (no transformation needed)
2573    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2574    if is_identity {
2575        return passthrough_stream(reader, writer);
2576    }
2577
2578    // Try SIMD fast path for constant-offset range translations (in-place, single buffer)
2579    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2580        return translate_range_stream(lo, hi, offset, reader, writer);
2581    }
2582
2583    // Try SIMD fast path for range-to-constant translations (e.g., '\000-\037' -> 'X').
2584    // Uses blendv (5 SIMD ops/32 bytes) instead of nibble decomposition (48 ops/32 bytes).
2585    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2586        return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2587    }
2588
2589    // General case: IN-PLACE translation on a SINGLE buffer.
2590    // Process each read chunk immediately for pipelining: while ftr translates
2591    // and writes chunk N, cat writes chunk N+1 to the pipe.
2592    // SAFETY: all bytes are written by read_once before being translated.
2593    let mut buf = alloc_uninit_vec(STREAM_BUF);
2594    loop {
2595        let n = read_once(reader, &mut buf)?;
2596        if n == 0 {
2597            break;
2598        }
2599        translate_and_write_table(&mut buf, n, &table, writer)?;
2600    }
2601    Ok(())
2602}
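
// Minimal usage sketch for the public streaming API with in-memory I/O. The expected
// output assumes the usual tr semantics of build_translate_table (set1[i] maps to
// set2[i]); both sets have equal length here, so no set2 extension is involved.
#[cfg(test)]
#[test]
fn translate_streaming_usage() {
    use std::io::Cursor;

    let mut out: Vec<u8> = Vec::new();
    translate(b"abc", b"ABC", &mut Cursor::new(&b"aabbcc dd"[..]), &mut out).unwrap();
    assert_eq!(out, b"AABBCC dd".to_vec());
}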
2603
2604#[inline]
2605fn translate_and_write_table(
2606    buf: &mut [u8],
2607    total: usize,
2608    table: &[u8; 256],
2609    writer: &mut impl Write,
2610) -> io::Result<()> {
2611    if total >= PARALLEL_THRESHOLD {
2612        let nt = rayon::current_num_threads().max(1);
2613        let cs = (total / nt).max(32 * 1024);
2614        buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2615            translate_inplace(chunk, table);
2616        });
2617    } else {
2618        translate_inplace(&mut buf[..total], table);
2619    }
2620    writer.write_all(&buf[..total])
2621}
2622
2623/// Streaming SIMD range translation — single buffer, in-place transform.
2624/// Processes each read chunk immediately for pipelining: while ftr translates
2625/// and writes chunk N, upstream cat writes chunk N+1 to the pipe.
2626/// Chunks >= PARALLEL_THRESHOLD use rayon par_chunks_mut (not reached with the 8MB STREAM_BUF).
2627fn translate_range_stream(
2628    lo: u8,
2629    hi: u8,
2630    offset: i8,
2631    reader: &mut impl Read,
2632    writer: &mut impl Write,
2633) -> io::Result<()> {
2634    let mut buf = alloc_uninit_vec(STREAM_BUF);
2635    loop {
2636        let n = read_once(reader, &mut buf)?;
2637        if n == 0 {
2638            break;
2639        }
2640        translate_and_write_range(&mut buf, n, lo, hi, offset, writer)?;
2641    }
2642    Ok(())
2643}
2644
2645#[inline]
2646fn translate_and_write_range(
2647    buf: &mut [u8],
2648    total: usize,
2649    lo: u8,
2650    hi: u8,
2651    offset: i8,
2652    writer: &mut impl Write,
2653) -> io::Result<()> {
2654    if total >= PARALLEL_THRESHOLD {
2655        let nt = rayon::current_num_threads().max(1);
2656        let cs = (total / nt).max(32 * 1024);
2657        buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2658            translate_range_simd_inplace(chunk, lo, hi, offset);
2659        });
2660    } else {
2661        translate_range_simd_inplace(&mut buf[..total], lo, hi, offset);
2662    }
2663    writer.write_all(&buf[..total])
2664}
2665
2666/// Streaming SIMD range-to-constant translation — single buffer, in-place transform.
2667/// Processes each read chunk immediately for pipelining with upstream cat.
2668/// Uses blendv instead of nibble decomposition for ~10x fewer SIMD ops per vector.
2669fn translate_range_to_constant_stream(
2670    lo: u8,
2671    hi: u8,
2672    replacement: u8,
2673    reader: &mut impl Read,
2674    writer: &mut impl Write,
2675) -> io::Result<()> {
2676    let mut buf = alloc_uninit_vec(STREAM_BUF);
2677    loop {
2678        let n = read_once(reader, &mut buf)?;
2679        if n == 0 {
2680            break;
2681        }
2682        translate_and_write_range_const(&mut buf, n, lo, hi, replacement, writer)?;
2683    }
2684    Ok(())
2685}
2686
2687#[inline]
2688fn translate_and_write_range_const(
2689    buf: &mut [u8],
2690    total: usize,
2691    lo: u8,
2692    hi: u8,
2693    replacement: u8,
2694    writer: &mut impl Write,
2695) -> io::Result<()> {
2696    if total >= PARALLEL_THRESHOLD {
2697        let nt = rayon::current_num_threads().max(1);
2698        let cs = (total / nt).max(32 * 1024);
2699        buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2700            translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
2701        });
2702    } else {
2703        translate_range_to_constant_simd_inplace(&mut buf[..total], lo, hi, replacement);
2704    }
2705    writer.write_all(&buf[..total])
2706}
2707
2708/// Pure passthrough: copy stdin to stdout without transformation.
2709/// Uses a single 8MB (STREAM_BUF) uninit buffer with direct read/write, no processing overhead.
2710fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2711    let mut buf = alloc_uninit_vec(STREAM_BUF);
2712    loop {
2713        let n = read_once(reader, &mut buf)?;
2714        if n == 0 {
2715            break;
2716        }
2717        writer.write_all(&buf[..n])?;
2718    }
2719    Ok(())
2720}
2721
2722/// Single-read for pipelining: process data immediately after first read()
2723/// instead of blocking to fill the entire buffer. This enables cat|ftr
2724/// pipelining: while ftr processes the first chunk, cat continues writing
2725/// to the pipe. For 10MB piped input with 8MB pipe buffer, this saves
2726/// ~0.5-1ms by overlapping cat's final writes with ftr's processing.
2727#[inline]
2728fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2729    loop {
2730        match reader.read(buf) {
2731            Ok(n) => return Ok(n),
2732            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2733            Err(e) => return Err(e),
2734        }
2735    }
2736}
2737
2738pub fn translate_squeeze(
2739    set1: &[u8],
2740    set2: &[u8],
2741    reader: &mut impl Read,
2742    writer: &mut impl Write,
2743) -> io::Result<()> {
2744    let table = build_translate_table(set1, set2);
2745    let squeeze_set = build_member_set(set2);
2746
2747    // When set2 is a single squeeze character (or all its chars are equal), use the fused
2748    // approach: translate in-place (SIMD when possible), then memmem to find squeeze points.
2749    if set2.len() == 1 || (set2.len() > 1 && set2.iter().all(|&b| b == set2[0])) {
2750        let squeeze_ch = set2.last().copied().unwrap_or(0);
2751        return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
2752    }
2753
2754    // Two-pass optimization for range translations:
2755    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
2756    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
2757    let range_info = detect_range_offset(&table);
2758    let range_const_info = if range_info.is_none() {
2759        detect_range_to_constant(&table)
2760    } else {
2761        None
2762    };
2763
2764    let mut buf = alloc_uninit_vec(STREAM_BUF);
2765    let mut last_squeezed: u16 = 256;
2766
2767    loop {
2768        let n = read_once(reader, &mut buf)?;
2769        if n == 0 {
2770            break;
2771        }
2772        let wp = translate_squeeze_process(
2773            &mut buf,
2774            n,
2775            &table,
2776            &squeeze_set,
2777            range_info,
2778            range_const_info,
2779            &mut last_squeezed,
2780        );
2781        if wp > 0 {
2782            writer.write_all(&buf[..wp])?;
2783        }
2784    }
2785    Ok(())
2786}
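
// Minimal usage sketch: translate then squeeze on the translated set. Both sets have the
// same length, so the only assumption about build_translate_table is the per-index
// mapping (a -> x, b -> x); the squeeze set is set2, i.e. {'x'}.
#[cfg(test)]
#[test]
fn translate_squeeze_usage() {
    use std::io::Cursor;

    let mut out: Vec<u8> = Vec::new();
    translate_squeeze(b"ab", b"xx", &mut Cursor::new(&b"aabb cd"[..]), &mut out).unwrap();
    // "aabb" all become 'x' and collapse to a single 'x'; " cd" passes through unchanged.
    assert_eq!(out, b"x cd".to_vec());
}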
2787
2788#[inline]
2789fn translate_squeeze_process(
2790    buf: &mut [u8],
2791    n: usize,
2792    table: &[u8; 256],
2793    squeeze_set: &[u8; 32],
2794    range_info: Option<(u8, u8, i8)>,
2795    range_const_info: Option<(u8, u8, u8)>,
2796    last_squeezed: &mut u16,
2797) -> usize {
2798    // Pass 1: translate
2799    if let Some((lo, hi, offset)) = range_info {
2800        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2801    } else if let Some((lo, hi, replacement)) = range_const_info {
2802        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2803    } else {
2804        translate_inplace(&mut buf[..n], table);
2805    }
2806    // Pass 2: squeeze in-place
2807    let mut wp = 0;
2808    unsafe {
2809        let ptr = buf.as_mut_ptr();
2810        let mut i = 0;
2811        while i + 8 <= n {
2812            macro_rules! squeeze_byte {
2813                ($off:expr) => {
2814                    let b = *ptr.add(i + $off);
2815                    if is_member(squeeze_set, b) {
2816                        if *last_squeezed != b as u16 {
2817                            *last_squeezed = b as u16;
2818                            *ptr.add(wp) = b;
2819                            wp += 1;
2820                        }
2821                    } else {
2822                        *last_squeezed = 256;
2823                        *ptr.add(wp) = b;
2824                        wp += 1;
2825                    }
2826                };
2827            }
2828            squeeze_byte!(0);
2829            squeeze_byte!(1);
2830            squeeze_byte!(2);
2831            squeeze_byte!(3);
2832            squeeze_byte!(4);
2833            squeeze_byte!(5);
2834            squeeze_byte!(6);
2835            squeeze_byte!(7);
2836            i += 8;
2837        }
2838        while i < n {
2839            let b = *ptr.add(i);
2840            if is_member(squeeze_set, b) {
2841                if *last_squeezed == b as u16 {
2842                    i += 1;
2843                    continue;
2844                }
2845                *last_squeezed = b as u16;
2846            } else {
2847                *last_squeezed = 256;
2848            }
2849            *ptr.add(wp) = b;
2850            wp += 1;
2851            i += 1;
2852        }
2853    }
2854    wp
2855}
2856
2857/// Optimized translate+squeeze for single squeeze character.
2858/// After in-place translation (SIMD when the table permits), uses memmem to find
2859/// consecutive pairs and compacts in-place with a single write_all per chunk.
2860fn translate_squeeze_single_ch(
2861    table: &[u8; 256],
2862    squeeze_ch: u8,
2863    _squeeze_set: &[u8; 32],
2864    reader: &mut impl Read,
2865    writer: &mut impl Write,
2866) -> io::Result<()> {
2867    let range_info = detect_range_offset(table);
2868    let range_const_info = if range_info.is_none() {
2869        detect_range_to_constant(table)
2870    } else {
2871        None
2872    };
2873
2874    let pair = [squeeze_ch, squeeze_ch];
2875    let finder = memchr::memmem::Finder::new(&pair);
2876    let mut buf = alloc_uninit_vec(STREAM_BUF);
2877    let mut was_squeeze_char = false;
2878
2879    loop {
2880        let n = read_once(reader, &mut buf)?;
2881        if n == 0 {
2882            break;
2883        }
2884        let wp = translate_squeeze_single_process(
2885            &mut buf,
2886            n,
2887            table,
2888            squeeze_ch,
2889            &finder,
2890            range_info,
2891            range_const_info,
2892            &mut was_squeeze_char,
2893        );
2894        if wp > 0 {
2895            writer.write_all(&buf[..wp])?;
2896        }
2897    }
2898    Ok(())
2899}
2900
2901#[inline]
2902fn translate_squeeze_single_process(
2903    buf: &mut [u8],
2904    n: usize,
2905    table: &[u8; 256],
2906    squeeze_ch: u8,
2907    finder: &memchr::memmem::Finder<'_>,
2908    range_info: Option<(u8, u8, i8)>,
2909    range_const_info: Option<(u8, u8, u8)>,
2910    was_squeeze_char: &mut bool,
2911) -> usize {
2912    // Pass 1: translate in-place
2913    if let Some((lo, hi, offset)) = range_info {
2914        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2915    } else if let Some((lo, hi, replacement)) = range_const_info {
2916        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2917    } else {
2918        translate_inplace(&mut buf[..n], table);
2919    }
2920
2921    // Pass 2: squeeze compaction
2922    let mut i = 0;
2923    if *was_squeeze_char {
2924        while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2925            i += 1;
2926        }
2927        *was_squeeze_char = false;
2928        if i >= n {
2929            *was_squeeze_char = true;
2930            return 0;
2931        }
2932    }
2933
2934    let ptr = buf.as_mut_ptr();
2935    let mut wp = 0usize;
2936
2937    loop {
2938        match finder.find(&buf[i..n]) {
2939            Some(offset) => {
2940                let seg_end = i + offset + 1;
2941                let gap = seg_end - i;
2942                if gap > 0 {
2943                    if wp != i {
2944                        unsafe {
2945                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
2946                        }
2947                    }
2948                    wp += gap;
2949                }
2950                i = seg_end;
2951                while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2952                    i += 1;
2953                }
2954                if i >= n {
2955                    *was_squeeze_char = true;
2956                    break;
2957                }
2958            }
2959            None => {
2960                let rem = n - i;
2961                if rem > 0 {
2962                    if wp != i {
2963                        unsafe {
2964                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
2965                        }
2966                    }
2967                    wp += rem;
2968                }
2969                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
2970                break;
2971            }
2972        }
2973    }
2974    wp
2975}
2976
2977pub fn delete(
2978    delete_chars: &[u8],
2979    reader: &mut impl Read,
2980    writer: &mut impl Write,
2981) -> io::Result<()> {
2982    if delete_chars.len() == 1 {
2983        return delete_single_streaming(delete_chars[0], reader, writer);
2984    }
2985    if delete_chars.len() <= 3 {
2986        return delete_multi_streaming(delete_chars, reader, writer);
2987    }
2988
2989    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
2990    // use vectorized range comparison instead of scalar bitset lookup.
2991    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
2992    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2993        return delete_range_streaming(lo, hi, reader, writer);
2994    }
2995
2996    let member = build_member_set(delete_chars);
2997    let mut buf = alloc_uninit_vec(STREAM_BUF);
2998    // Separate output buffer for SIMD compaction — keeps source data intact
2999    // while compact_8bytes_simd writes to a different location.
3000    let mut outbuf = alloc_uninit_vec(STREAM_BUF);
3001
3002    loop {
3003        let n = read_once(reader, &mut buf)?;
3004        if n == 0 {
3005            break;
3006        }
3007        let wp = delete_bitset_dispatch(&buf[..n], &mut outbuf, &member);
3008        if wp > 0 {
3009            writer.write_all(&outbuf[..wp])?;
3010        }
3011    }
3012    Ok(())
3013}
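// Quick end-to-end sketch for the streaming delete entry point above; deleting
// all ASCII digits exercises the contiguous-range fast path ([:digit:] is
// 0x30-0x39), though any of the dispatch paths must yield the same bytes.
#[cfg(test)]
mod delete_stream_sketch {
    use super::*;

    #[test]
    fn deletes_digit_range() -> io::Result<()> {
        let mut reader: &[u8] = b"a1b22c333d";
        let mut out: Vec<u8> = Vec::new();
        delete(b"0123456789", &mut reader, &mut out)?;
        assert_eq!(&out[..], b"abcd");
        Ok(())
    }
}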
3014
3015#[inline]
3016fn delete_bitset_dispatch(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3017    #[cfg(target_arch = "x86_64")]
3018    {
3019        if get_simd_level() >= 3 {
3020            return unsafe { delete_bitset_avx2_stream(src, dst, member) };
3021        }
3022    }
3023    delete_bitset_scalar(src, dst, member)
3024}
3025
3026/// Scalar bitset delete: write kept bytes to output buffer.
3027#[inline]
3028fn delete_bitset_scalar(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3029    let n = src.len();
3030    let mut wp = 0;
3031    unsafe {
3032        let sp = src.as_ptr();
3033        let dp = dst.as_mut_ptr();
3034        let mut i = 0;
3035        while i + 8 <= n {
3036            let b0 = *sp.add(i);
3037            let b1 = *sp.add(i + 1);
3038            let b2 = *sp.add(i + 2);
3039            let b3 = *sp.add(i + 3);
3040            let b4 = *sp.add(i + 4);
3041            let b5 = *sp.add(i + 5);
3042            let b6 = *sp.add(i + 6);
3043            let b7 = *sp.add(i + 7);
3044            *dp.add(wp) = b0;
3045            wp += !is_member(member, b0) as usize;
3046            *dp.add(wp) = b1;
3047            wp += !is_member(member, b1) as usize;
3048            *dp.add(wp) = b2;
3049            wp += !is_member(member, b2) as usize;
3050            *dp.add(wp) = b3;
3051            wp += !is_member(member, b3) as usize;
3052            *dp.add(wp) = b4;
3053            wp += !is_member(member, b4) as usize;
3054            *dp.add(wp) = b5;
3055            wp += !is_member(member, b5) as usize;
3056            *dp.add(wp) = b6;
3057            wp += !is_member(member, b6) as usize;
3058            *dp.add(wp) = b7;
3059            wp += !is_member(member, b7) as usize;
3060            i += 8;
3061        }
3062        while i < n {
3063            let b = *sp.add(i);
3064            *dp.add(wp) = b;
3065            wp += !is_member(member, b) as usize;
3066            i += 1;
3067        }
3068    }
3069    wp
3070}
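// Property-style sketch: the branchless write-then-advance compaction above
// must agree with a straightforward filter over the same membership set
// (assumes build_member_set/is_member share the bit layout used here).
#[cfg(test)]
mod delete_bitset_scalar_sketch {
    use super::*;

    #[test]
    fn matches_naive_filter() {
        let member = build_member_set(b"aeiou");
        let src = b"the quick brown fox jumps over the lazy dog".to_vec();
        let mut dst = vec![0u8; src.len()];
        let wp = delete_bitset_scalar(&src, &mut dst, &member);
        let expected: Vec<u8> = src.iter().copied().filter(|&b| !is_member(&member, b)).collect();
        assert_eq!(&dst[..wp], &expected[..]);
    }
}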
3071
3072/// AVX2 bitset delete for streaming: uses SIMD to check 32 bytes against the
3073/// membership bitset at once, then compact_8bytes_simd to pack kept bytes.
3074#[cfg(target_arch = "x86_64")]
3075#[target_feature(enable = "avx2")]
3076unsafe fn delete_bitset_avx2_stream(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3077    use std::arch::x86_64::*;
3078
3079    unsafe {
3080        let n = src.len();
3081        let sp = src.as_ptr();
3082        let dp = dst.as_mut_ptr();
3083        let mut ri = 0;
3084        let mut wp = 0;
3085
3086        // Load the 256-bit membership bitset into an AVX2 register.
3087        // Byte i of member_v has bits set for characters in [i*8..i*8+7].
3088        let member_v = _mm256_loadu_si256(member.as_ptr() as *const _);
3089
3090        // For each input byte B, we check: member[B >> 3] & (1 << (B & 7))
3091        // Using SIMD: extract byte index (B >> 3) and bit position (B & 7).
3092        let mask7 = _mm256_set1_epi8(7);
3093        let mask_0x1f = _mm256_set1_epi8(0x1F_u8 as i8);
3094
3095        // Lookup table for (1 << (x & 7)) — pshufb gives per-byte shift
3096        // that _mm256_sllv_epi32 can't do (it works on 32-bit lanes).
3097        let bit_table = _mm256_setr_epi8(
3098            1, 2, 4, 8, 16, 32, 64, -128i8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, -128i8,
3099            0, 0, 0, 0, 0, 0, 0, 0,
3100        );
3101
3102        while ri + 32 <= n {
3103            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
3104
3105            // byte_idx = input >> 3 (which byte of the 32-byte member set)
3106            let byte_idx = _mm256_and_si256(_mm256_srli_epi16(input, 3), mask_0x1f);
3107            // bit_pos = input & 7 (which bit within that byte)
3108            let bit_pos = _mm256_and_si256(input, mask7);
3109            // bit_mask = 1 << bit_pos (per-byte via shuffle lookup)
3110            let bit_mask = _mm256_shuffle_epi8(bit_table, bit_pos);
3111
3112            // member_byte = shuffle member_v by byte_idx (pshufb)
3113            // But pshufb only works within 128-bit lanes. We need cross-lane.
3114            // Since member is 32 bytes and byte_idx can be 0-31, we need
3115            // a different approach. Use two pshufb + blend:
3116            // lo_half = pshufb(member[0..15], byte_idx)
3117            // hi_half = pshufb(member[16..31], byte_idx - 16)
3118            // select = byte_idx >= 16
3119            let member_lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(member_v));
3120            let member_hi = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(member_v, 1));
3121            let lo_mask = _mm256_set1_epi8(0x0F);
3122            let idx_lo = _mm256_and_si256(byte_idx, lo_mask);
3123            let shuffled_lo = _mm256_shuffle_epi8(member_lo, idx_lo);
3124            let shuffled_hi = _mm256_shuffle_epi8(member_hi, idx_lo);
3125            // select hi when byte_idx >= 16 (bit 4 set)
3126            let use_hi = _mm256_slli_epi16(byte_idx, 3); // shift bit 4 to bit 7
3127            let member_byte = _mm256_blendv_epi8(shuffled_lo, shuffled_hi, use_hi);
3128
3129            // Check: (member_byte & bit_mask) != 0 → byte is in delete set
3130            let test = _mm256_and_si256(member_byte, bit_mask);
3131            let is_zero = _mm256_cmpeq_epi8(test, _mm256_setzero_si256());
3132            // keep_mask: bit set = byte should be KEPT (not in delete set)
3133            let keep_mask = _mm256_movemask_epi8(is_zero) as u32;
3134
3135            if keep_mask == 0xFFFFFFFF {
3136                // All 32 bytes kept — bulk copy
3137                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
3138                wp += 32;
3139            } else if keep_mask != 0 {
3140                // Partial keep — compact 8 bytes at a time
3141                let m0 = keep_mask as u8;
3142                let m1 = (keep_mask >> 8) as u8;
3143                let m2 = (keep_mask >> 16) as u8;
3144                let m3 = (keep_mask >> 24) as u8;
3145
3146                if m0 == 0xFF {
3147                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
3148                } else if m0 != 0 {
3149                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
3150                }
3151                let c0 = m0.count_ones() as usize;
3152
3153                if m1 == 0xFF {
3154                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
3155                } else if m1 != 0 {
3156                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
3157                }
3158                let c1 = m1.count_ones() as usize;
3159
3160                if m2 == 0xFF {
3161                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
3162                } else if m2 != 0 {
3163                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
3164                }
3165                let c2 = m2.count_ones() as usize;
3166
3167                if m3 == 0xFF {
3168                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
3169                } else if m3 != 0 {
3170                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
3171                }
3172                let c3 = m3.count_ones() as usize;
3173                wp += c0 + c1 + c2 + c3;
3174            }
3175            // else: all 32 bytes deleted, wp unchanged
3176            ri += 32;
3177        }
3178
3179        // Scalar tail
3180        while ri < n {
3181            let b = *sp.add(ri);
3182            *dp.add(wp) = b;
3183            wp += !is_member(member, b) as usize;
3184            ri += 1;
3185        }
3186
3187        wp
3188    }
3189}
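// Cross-check sketch: on AVX2-capable hosts the SIMD delete must produce the
// same bytes as the scalar path (the get_simd_level() >= 3 gate mirrors
// delete_bitset_dispatch).
#[cfg(all(test, target_arch = "x86_64"))]
mod delete_bitset_avx2_sketch {
    use super::*;

    #[test]
    fn avx2_matches_scalar() {
        if get_simd_level() < 3 {
            return; // host lacks AVX2; nothing to compare
        }
        let member = build_member_set(b"0123456789,;");
        let src: Vec<u8> = (0u32..4096).map(|i| (i % 251) as u8).collect();
        let mut simd_dst = vec![0u8; src.len()];
        let mut scalar_dst = vec![0u8; src.len()];
        let n_simd = unsafe { delete_bitset_avx2_stream(&src, &mut simd_dst, &member) };
        let n_scalar = delete_bitset_scalar(&src, &mut scalar_dst, &member);
        assert_eq!(&simd_dst[..n_simd], &scalar_dst[..n_scalar]);
    }
}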
3190
3191fn delete_single_streaming(
3192    ch: u8,
3193    reader: &mut impl Read,
3194    writer: &mut impl Write,
3195) -> io::Result<()> {
3196    let mut buf = alloc_uninit_vec(STREAM_BUF);
3197    loop {
3198        let n = read_once(reader, &mut buf)?;
3199        if n == 0 {
3200            break;
3201        }
3202        let wp = delete_single_inplace(&mut buf, n, ch);
3203        if wp > 0 {
3204            writer.write_all(&buf[..wp])?;
3205        }
3206    }
3207    Ok(())
3208}
3209
3210/// In-place single-char delete using memchr gap-copy.
3211#[inline]
3212fn delete_single_inplace(buf: &mut [u8], n: usize, ch: u8) -> usize {
3213    let mut wp = 0;
3214    let mut i = 0;
3215    while i < n {
3216        match memchr::memchr(ch, &buf[i..n]) {
3217            Some(offset) => {
3218                if offset > 0 {
3219                    if wp != i {
3220                        unsafe {
3221                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3222                        }
3223                    }
3224                    wp += offset;
3225                }
3226                i += offset + 1;
3227            }
3228            None => {
3229                let run_len = n - i;
3230                if run_len > 0 {
3231                    if wp != i {
3232                        unsafe {
3233                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3234                        }
3235                    }
3236                    wp += run_len;
3237                }
3238                break;
3239            }
3240        }
3241    }
3242    wp
3243}
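// Minimal sketch: the memchr gap-copy above is equivalent to filtering out the
// single delete character.
#[cfg(test)]
mod delete_single_inplace_sketch {
    use super::*;

    #[test]
    fn drops_every_occurrence() {
        let mut buf = b"a-b--c---d".to_vec();
        let n = buf.len();
        let wp = delete_single_inplace(&mut buf, n, b'-');
        assert_eq!(&buf[..wp], b"abcd");
    }
}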
3244
3245fn delete_multi_streaming(
3246    chars: &[u8],
3247    reader: &mut impl Read,
3248    writer: &mut impl Write,
3249) -> io::Result<()> {
3250    let mut buf = alloc_uninit_vec(STREAM_BUF);
3251    loop {
3252        let n = read_once(reader, &mut buf)?;
3253        if n == 0 {
3254            break;
3255        }
3256        let wp = delete_multi_inplace(&mut buf, n, chars);
3257        if wp > 0 {
3258            writer.write_all(&buf[..wp])?;
3259        }
3260    }
3261    Ok(())
3262}
3263
3264/// In-place multi-char delete using memchr2/memchr3 gap-copy.
3265#[inline]
3266fn delete_multi_inplace(buf: &mut [u8], n: usize, chars: &[u8]) -> usize {
3267    let mut wp = 0;
3268    let mut i = 0;
3269    while i < n {
3270        let found = if chars.len() == 2 {
3271            memchr::memchr2(chars[0], chars[1], &buf[i..n])
3272        } else {
3273            memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
3274        };
3275        match found {
3276            Some(offset) => {
3277                if offset > 0 {
3278                    if wp != i {
3279                        unsafe {
3280                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3281                        }
3282                    }
3283                    wp += offset;
3284                }
3285                i += offset + 1;
3286            }
3287            None => {
3288                let run_len = n - i;
3289                if run_len > 0 {
3290                    if wp != i {
3291                        unsafe {
3292                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3293                        }
3294                    }
3295                    wp += run_len;
3296                }
3297                break;
3298            }
3299        }
3300    }
3301    wp
3302}
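// Sketch for the 2/3-char path: the memchr2/memchr3 gap-copy drops exactly the
// listed characters and nothing else.
#[cfg(test)]
mod delete_multi_inplace_sketch {
    use super::*;

    #[test]
    fn drops_two_and_three_char_sets() {
        let mut two = b"a,b;c,d".to_vec();
        let n2 = two.len();
        let wp2 = delete_multi_inplace(&mut two, n2, b",;");
        assert_eq!(&two[..wp2], b"abcd");

        let mut three = b"1 a\t2 b\n3".to_vec();
        let n3 = three.len();
        let wp3 = delete_multi_inplace(&mut three, n3, b" \t\n");
        assert_eq!(&three[..wp3], b"1a2b3");
    }
}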
3303
3304pub fn delete_squeeze(
3305    delete_chars: &[u8],
3306    squeeze_chars: &[u8],
3307    reader: &mut impl Read,
3308    writer: &mut impl Write,
3309) -> io::Result<()> {
3310    let delete_set = build_member_set(delete_chars);
3311    let squeeze_set = build_member_set(squeeze_chars);
3312    let mut buf = alloc_uninit_vec(STREAM_BUF);
3313    let mut last_squeezed: u16 = 256;
3314
3315    loop {
3316        let n = read_once(reader, &mut buf)?;
3317        if n == 0 {
3318            break;
3319        }
3320        let wp = delete_squeeze_inplace(&mut buf, n, &delete_set, &squeeze_set, &mut last_squeezed);
3321        if wp > 0 {
3322            writer.write_all(&buf[..wp])?;
3323        }
3324    }
3325    Ok(())
3326}
3327
3328#[inline]
3329fn delete_squeeze_inplace(
3330    buf: &mut [u8],
3331    n: usize,
3332    delete_set: &[u8; 32],
3333    squeeze_set: &[u8; 32],
3334    last_squeezed: &mut u16,
3335) -> usize {
3336    let mut wp = 0;
3337    unsafe {
3338        let ptr = buf.as_mut_ptr();
3339        let mut i = 0;
3340        while i + 8 <= n {
3341            macro_rules! process_byte {
3342                ($off:expr) => {
3343                    let b = *ptr.add(i + $off);
3344                    if !is_member(delete_set, b) {
3345                        if is_member(squeeze_set, b) {
3346                            if *last_squeezed != b as u16 {
3347                                *last_squeezed = b as u16;
3348                                *ptr.add(wp) = b;
3349                                wp += 1;
3350                            }
3351                        } else {
3352                            *last_squeezed = 256;
3353                            *ptr.add(wp) = b;
3354                            wp += 1;
3355                        }
3356                    }
3357                };
3358            }
3359            process_byte!(0);
3360            process_byte!(1);
3361            process_byte!(2);
3362            process_byte!(3);
3363            process_byte!(4);
3364            process_byte!(5);
3365            process_byte!(6);
3366            process_byte!(7);
3367            i += 8;
3368        }
3369        while i < n {
3370            let b = *ptr.add(i);
3371            if !is_member(delete_set, b) {
3372                if is_member(squeeze_set, b) {
3373                    if *last_squeezed != b as u16 {
3374                        *last_squeezed = b as u16;
3375                        *ptr.add(wp) = b;
3376                        wp += 1;
3377                    }
3378                } else {
3379                    *last_squeezed = 256;
3380                    *ptr.add(wp) = b;
3381                    wp += 1;
3382                }
3383            }
3384            i += 1;
3385        }
3386    }
3387    wp
3388}
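// Sketch of the combined -d/-s semantics, including the cross-chunk carry in
// last_squeezed: a squeeze run that straddles two reads must still collapse to
// a single byte.
#[cfg(test)]
mod delete_squeeze_inplace_sketch {
    use super::*;

    #[test]
    fn squeeze_run_carries_across_chunks() {
        let delete_set = build_member_set(b"-");
        let squeeze_set = build_member_set(b"x");
        let mut last_squeezed: u16 = 256;

        let mut first = b"a-xx".to_vec();
        let n1 = first.len();
        let w1 = delete_squeeze_inplace(&mut first, n1, &delete_set, &squeeze_set, &mut last_squeezed);
        assert_eq!(&first[..w1], b"ax");

        let mut second = b"xx-b".to_vec();
        let n2 = second.len();
        let w2 = delete_squeeze_inplace(&mut second, n2, &delete_set, &squeeze_set, &mut last_squeezed);
        // the leading "xx" continues the previous run, so it is squeezed away
        assert_eq!(&second[..w2], b"b");
    }
}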
3389
3390pub fn squeeze(
3391    squeeze_chars: &[u8],
3392    reader: &mut impl Read,
3393    writer: &mut impl Write,
3394) -> io::Result<()> {
3395    if squeeze_chars.len() == 1 {
3396        return squeeze_single_stream(squeeze_chars[0], reader, writer);
3397    }
3398
3399    // For 2-3 squeeze chars, use memchr2/memchr3-based gap-copy
3400    // which gives SIMD-accelerated scanning instead of byte-at-a-time.
3401    if squeeze_chars.len() <= 3 {
3402        return squeeze_multi_stream(squeeze_chars, reader, writer);
3403    }
3404
3405    let member = build_member_set(squeeze_chars);
3406    let mut buf = alloc_uninit_vec(STREAM_BUF);
3407    let mut last_squeezed: u16 = 256;
3408
3409    loop {
3410        let n = read_once(reader, &mut buf)?;
3411        if n == 0 {
3412            break;
3413        }
3414        let wp = squeeze_inplace_bitset(&mut buf, n, &member, &mut last_squeezed);
3415        if wp > 0 {
3416            writer.write_all(&buf[..wp])?;
3417        }
3418    }
3419    Ok(())
3420}
3421
3422#[inline]
3423fn squeeze_inplace_bitset(
3424    buf: &mut [u8],
3425    n: usize,
3426    member: &[u8; 32],
3427    last_squeezed: &mut u16,
3428) -> usize {
3429    let mut wp = 0;
3430    unsafe {
3431        let ptr = buf.as_mut_ptr();
3432        for i in 0..n {
3433            let b = *ptr.add(i);
3434            if is_member(member, b) {
3435                if *last_squeezed == b as u16 {
3436                    continue;
3437                }
3438                *last_squeezed = b as u16;
3439            } else {
3440                *last_squeezed = 256;
3441            }
3442            *ptr.add(wp) = b;
3443            wp += 1;
3444        }
3445    }
3446    wp
3447}
3448
3449/// Streaming squeeze for 2-3 chars using memchr2/memchr3 SIMD scanning.
3450/// Each read chunk is compacted in place by squeeze_multi_compact (gap-copy
3451/// between squeeze points), then flushed with a single write_all per chunk.
3452fn squeeze_multi_stream(
3453    chars: &[u8],
3454    reader: &mut impl Read,
3455    writer: &mut impl Write,
3456) -> io::Result<()> {
3457    let c0 = chars[0];
3458    let c1 = chars[1];
3459    let c2 = if chars.len() >= 3 {
3460        Some(chars[2])
3461    } else {
3462        None
3463    };
3464
3465    let mut buf = alloc_uninit_vec(STREAM_BUF);
3466    let mut last_squeezed: u16 = 256;
3467
3468    loop {
3469        let n = read_once(reader, &mut buf)?;
3470        if n == 0 {
3471            break;
3472        }
3473        let wp = squeeze_multi_compact(&mut buf, n, c0, c1, c2, &mut last_squeezed);
3474        if wp > 0 {
3475            writer.write_all(&buf[..wp])?;
3476        }
3477    }
3478    Ok(())
3479}
3480
3481/// In-place multi-char squeeze using memchr2/memchr3 gap-copy.
3482#[inline]
3483fn squeeze_multi_compact(
3484    buf: &mut [u8],
3485    n: usize,
3486    c0: u8,
3487    c1: u8,
3488    c2: Option<u8>,
3489    last_squeezed: &mut u16,
3490) -> usize {
3491    let ptr = buf.as_mut_ptr();
3492    let mut wp = 0usize;
3493    let mut cursor = 0usize;
3494
3495    while cursor < n {
3496        let found = if let Some(c) = c2 {
3497            memchr::memchr3(c0, c1, c, &buf[cursor..n])
3498        } else {
3499            memchr::memchr2(c0, c1, &buf[cursor..n])
3500        };
3501        match found {
3502            Some(offset) => {
3503                let pos = cursor + offset;
3504                let b = unsafe { *ptr.add(pos) };
3505
3506                let gap = pos - cursor;
3507                if gap > 0 {
3508                    if wp != cursor {
3509                        unsafe {
3510                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
3511                        }
3512                    }
3513                    wp += gap;
3514                    *last_squeezed = 256;
3515                }
3516
3517                if *last_squeezed != b as u16 {
3518                    unsafe { *ptr.add(wp) = b };
3519                    wp += 1;
3520                    *last_squeezed = b as u16;
3521                }
3522
3523                cursor = pos + 1;
3524                while cursor < n && unsafe { *ptr.add(cursor) } == b {
3525                    cursor += 1;
3526                }
3527            }
3528            None => {
3529                let rem = n - cursor;
3530                if rem > 0 {
3531                    if wp != cursor {
3532                        unsafe {
3533                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
3534                        }
3535                    }
3536                    wp += rem;
3537                    *last_squeezed = 256;
3538                }
3539                break;
3540            }
3541        }
3542    }
3543    wp
3544}
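// Sketch for the 2/3-char squeeze: each squeezable character collapses only its
// own adjacent runs; bytes outside the set reset last_squeezed.
#[cfg(test)]
mod squeeze_multi_compact_sketch {
    use super::*;

    #[test]
    fn squeezes_each_char_independently() {
        let mut last: u16 = 256;
        let mut buf = b"aa..bb,,cc".to_vec();
        let n = buf.len();
        let wp = squeeze_multi_compact(&mut buf, n, b'.', b',', None, &mut last);
        assert_eq!(&buf[..wp], b"aa.bb,cc");
    }
}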
3545
3546fn squeeze_single_stream(
3547    ch: u8,
3548    reader: &mut impl Read,
3549    writer: &mut impl Write,
3550) -> io::Result<()> {
3551    let mut buf = alloc_uninit_vec(STREAM_BUF);
3552    let mut was_squeeze_char = false;
3553
3554    // AVX2 path: process 32 bytes at a time with SIMD compaction
3555    #[cfg(target_arch = "x86_64")]
3556    if get_simd_level() >= 3 {
3557        loop {
3558            let n = read_once(reader, &mut buf)?;
3559            if n == 0 {
3560                break;
3561            }
3562            let wp = unsafe { squeeze_single_avx2_inplace(&mut buf, n, ch, &mut was_squeeze_char) };
3563            if wp > 0 {
3564                writer.write_all(&buf[..wp])?;
3565            }
3566        }
3567        return Ok(());
3568    }
3569
3570    // Fallback: memmem-based approach
3571    let pair = [ch, ch];
3572    let finder = memchr::memmem::Finder::new(&pair);
3573    loop {
3574        let n = read_once(reader, &mut buf)?;
3575        if n == 0 {
3576            break;
3577        }
3578        let wp = squeeze_single_compact(&mut buf, n, ch, &finder, &mut was_squeeze_char);
3579        if wp > 0 {
3580            writer.write_all(&buf[..wp])?;
3581        }
3582    }
3583    Ok(())
3584}
3585
3586/// In-place squeeze compaction for single-char using memmem.
3587#[inline]
3588fn squeeze_single_compact(
3589    buf: &mut [u8],
3590    n: usize,
3591    ch: u8,
3592    finder: &memchr::memmem::Finder<'_>,
3593    was_squeeze_char: &mut bool,
3594) -> usize {
3595    let mut i = 0;
3596
3597    // Handle carry-over from previous flush
3598    if *was_squeeze_char {
3599        while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3600            i += 1;
3601        }
3602        *was_squeeze_char = false;
3603        if i >= n {
3604            *was_squeeze_char = true;
3605            return 0;
3606        }
3607    }
3608
3609    let ptr = buf.as_mut_ptr();
3610    let mut wp = 0usize;
3611
3612    loop {
3613        match finder.find(&buf[i..n]) {
3614            Some(offset) => {
3615                let seg_end = i + offset + 1;
3616                let gap = seg_end - i;
3617                if gap > 0 {
3618                    if wp != i {
3619                        unsafe {
3620                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
3621                        }
3622                    }
3623                    wp += gap;
3624                }
3625                i = seg_end;
3626                while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3627                    i += 1;
3628                }
3629                if i >= n {
3630                    *was_squeeze_char = true;
3631                    break;
3632                }
3633            }
3634            None => {
3635                let rem = n - i;
3636                if rem > 0 {
3637                    if wp != i {
3638                        unsafe {
3639                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
3640                        }
3641                    }
3642                    wp += rem;
3643                }
3644                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
3645                break;
3646            }
3647        }
3648    }
3649    wp
3650}
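// Sketch for the memmem fallback: only adjacent duplicates of the squeeze char
// collapse, and a run ending at the buffer boundary sets the carry flag.
#[cfg(test)]
mod squeeze_single_compact_sketch {
    use super::*;

    #[test]
    fn collapses_adjacent_runs_and_carries() {
        let finder = memchr::memmem::Finder::new(b"  ");
        let mut carry = false;

        let mut first = b"a  b ".to_vec();
        let n1 = first.len();
        let w1 = squeeze_single_compact(&mut first, n1, b' ', &finder, &mut carry);
        assert_eq!(&first[..w1], b"a b ");
        assert!(carry);

        let mut second = b"  c".to_vec();
        let n2 = second.len();
        let w2 = squeeze_single_compact(&mut second, n2, b' ', &finder, &mut carry);
        // the leading spaces continue the previous chunk's run
        assert_eq!(&second[..w2], b"c");
    }
}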
3651
3652/// AVX2-accelerated in-place squeeze compaction for a single character.
3653///
3654/// Processes 32 bytes at a time: compare all bytes against `ch`, build a mask
3655/// of consecutive duplicates (byte == ch AND previous byte == ch), then compact
3656/// using the same COMPACT_LUT + pshufb pattern as `delete_range_avx2`.
3657///
3658/// The `carry` bit tracks whether the last byte of the previous 32-byte block
3659/// was the squeeze char, enabling correct cross-block duplicate detection.
3660///
3661/// Safety: `buf[..n]` must be valid. wp <= ri always holds since we only remove bytes.
3662#[cfg(target_arch = "x86_64")]
3663#[target_feature(enable = "avx2")]
3664unsafe fn squeeze_single_avx2_inplace(
3665    buf: &mut [u8],
3666    n: usize,
3667    ch: u8,
3668    was_squeeze_char: &mut bool,
3669) -> usize {
3670    use std::arch::x86_64::*;
3671
3672    unsafe {
3673        let ch_v = _mm256_set1_epi8(ch as i8);
3674        let ptr = buf.as_mut_ptr();
3675        let mut ri = 0;
3676        let mut wp = 0;
3677        let mut carry: u32 = if *was_squeeze_char { 1 } else { 0 };
3678
3679        while ri + 32 <= n {
3680            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
3681            let cmp = _mm256_cmpeq_epi8(input, ch_v);
3682            let sq_mask = _mm256_movemask_epi8(cmp) as u32;
3683
3684            // prev_sq_mask: bit i set if byte i-1 was the squeeze char
3685            // (shift sq_mask left by 1, fill bit 0 from carry)
3686            let prev_sq_mask = (sq_mask << 1) | carry;
3687
3688            // remove_mask: bit set where byte IS ch AND previous byte WAS ch
3689            let remove_mask = sq_mask & prev_sq_mask;
3690
3691            // carry for next block: was the last byte (bit 31) the squeeze char?
3692            carry = (sq_mask >> 31) & 1;
3693
3694            if remove_mask == 0 {
3695                // No duplicates to remove — bulk copy
3696                if wp != ri {
3697                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
3698                }
3699                wp += 32;
3700            } else if remove_mask != 0xFFFFFFFF {
3701                // Partial removal — per-lane compaction (same pattern as delete_range_avx2)
3702                let keep_mask = !remove_mask;
3703                let m0 = keep_mask as u8;
3704                let m1 = (keep_mask >> 8) as u8;
3705                let m2 = (keep_mask >> 16) as u8;
3706                let m3 = (keep_mask >> 24) as u8;
3707
3708                if m0 == 0xFF {
3709                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 8); // may overlap: same buffer, wp <= ri
3710                } else if m0 != 0 {
3711                    compact_8bytes_simd(ptr.add(ri), ptr.add(wp), m0);
3712                }
3713                let c0 = m0.count_ones() as usize;
3714
3715                if m1 == 0xFF {
3716                    std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
3717                } else if m1 != 0 {
3718                    compact_8bytes_simd(ptr.add(ri + 8), ptr.add(wp + c0), m1);
3719                }
3720                let c1 = m1.count_ones() as usize;
3721
3722                if m2 == 0xFF {
3723                    std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
3724                } else if m2 != 0 {
3725                    compact_8bytes_simd(ptr.add(ri + 16), ptr.add(wp + c0 + c1), m2);
3726                }
3727                let c2 = m2.count_ones() as usize;
3728
3729                if m3 == 0xFF {
3730                    std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
3731                } else if m3 != 0 {
3732                    compact_8bytes_simd(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), m3);
3733                }
3734                let c3 = m3.count_ones() as usize;
3735                wp += c0 + c1 + c2 + c3;
3736            }
3737            // else: remove_mask == 0xFFFFFFFF means all 32 bytes are duplicate squeeze chars, skip
3738
3739            ri += 32;
3740        }
3741
3742        // SSE2 tail for 16-byte remainder
3743        if ri + 16 <= n {
3744            let ch_v128 = _mm_set1_epi8(ch as i8);
3745            let input = _mm_loadu_si128(ptr.add(ri) as *const _);
3746            let cmp = _mm_cmpeq_epi8(input, ch_v128);
3747            let sq_mask = _mm_movemask_epi8(cmp) as u32 & 0xFFFF;
3748            let prev_sq_mask = (sq_mask << 1) | carry;
3749            let remove_mask = sq_mask & prev_sq_mask;
3750            carry = (sq_mask >> 15) & 1;
3751
3752            if remove_mask == 0 {
3753                if wp != ri {
3754                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 16);
3755                }
3756                wp += 16;
3757            } else if remove_mask != 0xFFFF {
3758                let keep_mask = !remove_mask;
3759                let m0 = keep_mask as u8;
3760                let m1 = (keep_mask >> 8) as u8;
3761                if m0 == 0xFF {
3762                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 8); // may overlap: same buffer, wp <= ri
3763                } else if m0 != 0 {
3764                    compact_8bytes_simd(ptr.add(ri), ptr.add(wp), m0);
3765                }
3766                let c0 = m0.count_ones() as usize;
3767                if m1 == 0xFF {
3768                    std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
3769                } else if m1 != 0 {
3770                    compact_8bytes_simd(ptr.add(ri + 8), ptr.add(wp + c0), m1);
3771                }
3772                wp += c0 + m1.count_ones() as usize;
3773            }
3774            ri += 16;
3775        }
3776
3777        // Scalar tail for remaining bytes
3778        while ri < n {
3779            let b = *ptr.add(ri);
3780            if b == ch && carry != 0 {
3781                // Duplicate squeeze char — skip it
3782            } else {
3783                *ptr.add(wp) = b;
3784                wp += 1;
3785            }
3786            carry = if b == ch { 1 } else { 0 };
3787            ri += 1;
3788        }
3789
3790        *was_squeeze_char = carry != 0;
3791        wp
3792    }
3793}
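// Cross-check sketch (x86_64 + AVX2 only): the SIMD squeeze must agree with the
// memmem fallback above, including the cross-block carry bit.
#[cfg(all(test, target_arch = "x86_64"))]
mod squeeze_single_avx2_sketch {
    use super::*;

    #[test]
    fn avx2_matches_memmem_fallback() {
        if get_simd_level() < 3 {
            return; // host lacks AVX2
        }
        let data: Vec<u8> = (0u32..4096)
            .map(|i| if i % 3 == 0 { b'x' } else { b'y' })
            .collect();
        let n = data.len();

        let mut simd_buf = data.clone();
        let mut simd_carry = false;
        let simd_wp = unsafe { squeeze_single_avx2_inplace(&mut simd_buf, n, b'y', &mut simd_carry) };

        let finder = memchr::memmem::Finder::new(b"yy");
        let mut scalar_buf = data;
        let mut scalar_carry = false;
        let scalar_wp = squeeze_single_compact(&mut scalar_buf, n, b'y', &finder, &mut scalar_carry);

        assert_eq!(&simd_buf[..simd_wp], &scalar_buf[..scalar_wp]);
        assert_eq!(simd_carry, scalar_carry);
    }
}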
3794
3795// ============================================================================
3796// Batch in-place functions (owned data from piped stdin)
3797// ============================================================================
3798
3799/// Translate bytes in-place on an owned buffer, then write.
3800/// For piped stdin where we own the data, this avoids the separate output buffer
3801/// allocation needed by translate_mmap. Uses parallel in-place SIMD for large data.
3802pub fn translate_owned(
3803    set1: &[u8],
3804    set2: &[u8],
3805    data: &mut [u8],
3806    writer: &mut impl Write,
3807) -> io::Result<()> {
3808    let table = build_translate_table(set1, set2);
3809
3810    // Identity table — pure passthrough
3811    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3812    if is_identity {
3813        return writer.write_all(data);
3814    }
3815
3816    // Parallel translate was a 105% regression at 10MB benchmark size.
3817    // Single-threaded AVX2 is fast enough. Only parallelize for genuinely
3818    // large files (>=64MB) where multi-core benefits clearly exceed overhead.
3819    const OWNED_PARALLEL_MIN: usize = 64 * 1024 * 1024;
3820
3821    // SIMD range fast path (in-place)
3822    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3823        if data.len() >= OWNED_PARALLEL_MIN {
3824            let n_threads = rayon::current_num_threads().max(1);
3825            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3826            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3827                translate_range_simd_inplace(chunk, lo, hi, offset);
3828            });
3829        } else {
3830            translate_range_simd_inplace(data, lo, hi, offset);
3831        }
3832        return writer.write_all(data);
3833    }
3834
3835    // SIMD range-to-constant fast path (in-place)
3836    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3837        if data.len() >= OWNED_PARALLEL_MIN {
3838            let n_threads = rayon::current_num_threads().max(1);
3839            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3840            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3841                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
3842            });
3843        } else {
3844            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3845        }
3846        return writer.write_all(data);
3847    }
3848
3849    // General table lookup (in-place)
3850    if data.len() >= OWNED_PARALLEL_MIN {
3851        let n_threads = rayon::current_num_threads().max(1);
3852        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3853        data.par_chunks_mut(chunk_size).for_each(|chunk| {
3854            translate_inplace(chunk, &table);
3855        });
3856    } else {
3857        translate_inplace(data, &table);
3858    }
3859    writer.write_all(data)
3860}
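// Usage sketch for the owned-buffer path (hedged: assumes build_translate_table
// implements ordinary `tr SET1 SET2` mapping of set1[i] -> set2[i]).
#[cfg(test)]
mod translate_owned_sketch {
    use super::*;

    #[test]
    fn translates_in_place_and_writes() -> io::Result<()> {
        let mut data = b"abcabc".to_vec();
        let mut out: Vec<u8> = Vec::new();
        translate_owned(b"abc", b"xyz", &mut data, &mut out)?;
        assert_eq!(&out[..], b"xyzxyz");
        Ok(())
    }
}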
3861
3862// ============================================================================
3863// Mmap-based functions (zero-copy input from byte slice)
3864// ============================================================================
3865
3866/// Translate bytes from an mmap'd byte slice.
3867/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
3868/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
3869/// Detects range-to-constant translations and uses a blendv-based SIMD path.
3870/// Falls back to scalar 256-byte table lookup for general translations.
3871///
3872/// For data >= PARALLEL_THRESHOLD (64MB): rayon parallel translate into a
3873/// full-size buffer, then a single write_all.
3874/// Otherwise: chunked translate through a 256KB L2-resident buffer.
3875pub fn translate_mmap(
3876    set1: &[u8],
3877    set2: &[u8],
3878    data: &[u8],
3879    writer: &mut impl Write,
3880) -> io::Result<()> {
3881    let table = build_translate_table(set1, set2);
3882
3883    // Check if table is identity — pure passthrough
3884    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3885    if is_identity {
3886        return writer.write_all(data);
3887    }
3888
3889    // Try SIMD fast path for single-range constant-offset translations
3890    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3891        return translate_mmap_range(data, writer, lo, hi, offset);
3892    }
3893
3894    // Try SIMD fast path for range-to-constant translations
3895    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3896        return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3897    }
3898
3899    // General case: table lookup (with parallel processing for large data)
3900    translate_mmap_table(data, writer, &table)
3901}
3902
3903/// SIMD range translate for mmap data, with rayon parallel processing.
3904fn translate_mmap_range(
3905    data: &[u8],
3906    writer: &mut impl Write,
3907    lo: u8,
3908    hi: u8,
3909    offset: i8,
3910) -> io::Result<()> {
3911    // Parallel path: split data into chunks, translate each in parallel
3912    if data.len() >= PARALLEL_THRESHOLD {
3913        let mut buf = alloc_uninit_vec(data.len());
3914        let n_threads = rayon::current_num_threads().max(1);
3915        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3916
3917        // Process chunks in parallel: each thread writes to its slice of buf
3918        data.par_chunks(chunk_size)
3919            .zip(buf.par_chunks_mut(chunk_size))
3920            .for_each(|(src_chunk, dst_chunk)| {
3921                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
3922            });
3923
3924        return writer.write_all(&buf);
3925    }
3926
3927    // Chunked SIMD translate: 256KB buffer fits in L2 cache.
3928    const CHUNK: usize = 256 * 1024;
3929    let buf_size = data.len().min(CHUNK);
3930    let mut buf = alloc_uninit_vec(buf_size);
3931    for chunk in data.chunks(CHUNK) {
3932        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
3933        writer.write_all(&buf[..chunk.len()])?;
3934    }
3935    Ok(())
3936}
3937
3938/// SIMD range-to-constant translate for mmap data.
3939/// Uses blendv (5 SIMD ops/32 bytes) for range-to-constant patterns.
3940fn translate_mmap_range_to_constant(
3941    data: &[u8],
3942    writer: &mut impl Write,
3943    lo: u8,
3944    hi: u8,
3945    replacement: u8,
3946) -> io::Result<()> {
3947    // For mmap data (read-only), copy to buffer and translate in-place
3948    if data.len() >= PARALLEL_THRESHOLD {
3949        let mut buf = alloc_uninit_vec(data.len());
3950        let n_threads = rayon::current_num_threads().max(1);
3951        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3952
3953        // Copy + translate in parallel
3954        data.par_chunks(chunk_size)
3955            .zip(buf.par_chunks_mut(chunk_size))
3956            .for_each(|(src_chunk, dst_chunk)| {
3957                dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
3958                translate_range_to_constant_simd_inplace(
3959                    &mut dst_chunk[..src_chunk.len()],
3960                    lo,
3961                    hi,
3962                    replacement,
3963                );
3964            });
3965
3966        return writer.write_all(&buf);
3967    }
3968
3969    // Chunked translate: 256KB buffer fits in L2 cache.
3970    const CHUNK: usize = 256 * 1024;
3971    let buf_size = data.len().min(CHUNK);
3972    let mut buf = alloc_uninit_vec(buf_size);
3973    for chunk in data.chunks(CHUNK) {
3974        buf[..chunk.len()].copy_from_slice(chunk);
3975        translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3976        writer.write_all(&buf[..chunk.len()])?;
3977    }
3978    Ok(())
3979}
3980
3981/// General table-lookup translate for mmap data, with rayon parallel processing.
3982fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
3983    // Parallel path: split data into chunks, translate each in parallel
3984    if data.len() >= PARALLEL_THRESHOLD {
3985        let mut buf = alloc_uninit_vec(data.len());
3986        let n_threads = rayon::current_num_threads().max(1);
3987        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3988
3989        data.par_chunks(chunk_size)
3990            .zip(buf.par_chunks_mut(chunk_size))
3991            .for_each(|(src_chunk, dst_chunk)| {
3992                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
3993            });
3994
3995        return writer.write_all(&buf);
3996    }
3997
3998    // Chunked translate: 256KB buffer fits in L2 cache.
3999    const CHUNK: usize = 256 * 1024;
4000    let buf_size = data.len().min(CHUNK);
4001    let mut buf = alloc_uninit_vec(buf_size);
4002    for chunk in data.chunks(CHUNK) {
4003        translate_to(chunk, &mut buf[..chunk.len()], table);
4004        writer.write_all(&buf[..chunk.len()])?;
4005    }
4006    Ok(())
4007}
4008
4009/// Translate bytes in-place on a mutable buffer (e.g., MAP_PRIVATE mmap).
4010/// Eliminates the output buffer allocation entirely — the kernel's COW
4011/// semantics mean only modified pages are physically copied.
4012///
4013/// For data >= PARALLEL_THRESHOLD: rayon parallel in-place translate.
4014/// Otherwise: single-threaded in-place translate.
4015pub fn translate_mmap_inplace(
4016    set1: &[u8],
4017    set2: &[u8],
4018    data: &mut [u8],
4019    writer: &mut impl Write,
4020) -> io::Result<()> {
4021    let table = build_translate_table(set1, set2);
4022
4023    // Check if table is identity — pure passthrough
4024    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
4025    if is_identity {
4026        return writer.write_all(data);
4027    }
4028
4029    // Always translate in-place. With MAP_PRIVATE + MADV_HUGEPAGE, COW faults
4030    // use 2MB pages — even for 10MB files only ~5 COW faults (~10µs), which is
4031    // far cheaper than allocating a separate 10MB output buffer (~300µs).
4032    // This eliminates the output buffer allocation + data copy entirely.
4033
4034    // Try SIMD fast path for single-range constant-offset translations (e.g., a-z -> A-Z)
4035    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
4036        if data.len() >= PARALLEL_THRESHOLD {
4037            let n_threads = rayon::current_num_threads().max(1);
4038            let chunk_size = (data.len() / n_threads).max(32 * 1024);
4039            data.par_chunks_mut(chunk_size)
4040                .for_each(|chunk| translate_range_simd_inplace(chunk, lo, hi, offset));
4041        } else {
4042            translate_range_simd_inplace(data, lo, hi, offset);
4043        }
4044        return writer.write_all(data);
4045    }
4046
4047    // Try SIMD fast path for range-to-constant translations
4048    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
4049        if data.len() >= PARALLEL_THRESHOLD {
4050            let n_threads = rayon::current_num_threads().max(1);
4051            let chunk_size = (data.len() / n_threads).max(32 * 1024);
4052            data.par_chunks_mut(chunk_size).for_each(|chunk| {
4053                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement)
4054            });
4055        } else {
4056            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
4057        }
4058        return writer.write_all(data);
4059    }
4060
4061    // General case: in-place table lookup
4062    if data.len() >= PARALLEL_THRESHOLD {
4063        let n_threads = rayon::current_num_threads().max(1);
4064        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4065        data.par_chunks_mut(chunk_size)
4066            .for_each(|chunk| translate_inplace(chunk, &table));
4067    } else {
4068        translate_inplace(data, &table);
4069    }
4070    writer.write_all(data)
4071}
4072
4073/// Translate from read-only source to a separate output buffer, avoiding COW faults.
4074/// Uses the appropriate SIMD path (range offset, range-to-constant, or general nibble).
4075///
4076/// For data >= PARALLEL_THRESHOLD: parallel chunked translate into full-size buffer.
4077/// For smaller data: single full-size allocation + single write_all for minimum
4078/// syscall overhead. At 10MB, the allocation is cheap and a single write() is faster
4079/// than multiple 4MB chunked writes.
4080fn translate_to_separate_buf(
4081    data: &[u8],
4082    table: &[u8; 256],
4083    writer: &mut impl Write,
4084) -> io::Result<()> {
4085    let range_info = detect_range_offset(table);
4086    let const_info = if range_info.is_none() {
4087        detect_range_to_constant(table)
4088    } else {
4089        None
4090    };
4091
4092    if data.len() >= PARALLEL_THRESHOLD {
4093        // Parallel path: full-size output buffer, parallel translate, single write.
4094        let mut out_buf = alloc_uninit_vec(data.len());
4095        let n_threads = rayon::current_num_threads().max(1);
4096        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4097
4098        if let Some((lo, hi, offset)) = range_info {
4099            data.par_chunks(chunk_size)
4100                .zip(out_buf.par_chunks_mut(chunk_size))
4101                .for_each(|(src, dst)| {
4102                    translate_range_simd(src, &mut dst[..src.len()], lo, hi, offset);
4103                });
4104        } else if let Some((lo, hi, replacement)) = const_info {
4105            data.par_chunks(chunk_size)
4106                .zip(out_buf.par_chunks_mut(chunk_size))
4107                .for_each(|(src, dst)| {
4108                    translate_range_to_constant_simd(
4109                        src,
4110                        &mut dst[..src.len()],
4111                        lo,
4112                        hi,
4113                        replacement,
4114                    );
4115                });
4116        } else {
4117            data.par_chunks(chunk_size)
4118                .zip(out_buf.par_chunks_mut(chunk_size))
4119                .for_each(|(src, dst)| {
4120                    translate_to(src, &mut dst[..src.len()], table);
4121                });
4122        }
4123        return writer.write_all(&out_buf);
4124    }
4125
4126    // Single-allocation translate: full-size output buffer, single translate, single write.
4127    // For 10MB data, this does 1 write() instead of 40 chunked writes, eliminating
4128    // 39 write() syscalls. SIMD translate streams through src and dst sequentially,
4129    // so the L2 cache argument for 256KB chunks doesn't apply (src data doesn't fit
4130    // in L2 anyway). The reduced syscall overhead more than compensates.
4131    let mut out_buf = alloc_uninit_vec(data.len());
4132    if let Some((lo, hi, offset)) = range_info {
4133        translate_range_simd(data, &mut out_buf, lo, hi, offset);
4134    } else if let Some((lo, hi, replacement)) = const_info {
4135        translate_range_to_constant_simd(data, &mut out_buf, lo, hi, replacement);
4136    } else {
4137        translate_to(data, &mut out_buf, table);
4138    }
4139    writer.write_all(&out_buf)
4140}
4141
4142/// Translate from a read-only mmap (or any byte slice) to a separate output buffer.
4143/// Avoids MAP_PRIVATE COW page faults by reading from the original data and
4144/// writing to a freshly allocated heap buffer.
4145pub fn translate_mmap_readonly(
4146    set1: &[u8],
4147    set2: &[u8],
4148    data: &[u8],
4149    writer: &mut impl Write,
4150) -> io::Result<()> {
4151    let table = build_translate_table(set1, set2);
4152
4153    // Check if table is identity — pure passthrough
4154    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
4155    if is_identity {
4156        return writer.write_all(data);
4157    }
4158
4159    translate_to_separate_buf(data, &table, writer)
4160}
4161
4162/// Translate + squeeze from mmap'd byte slice.
4163///
4164/// For data >= PARALLEL_THRESHOLD (64MB): two-phase: parallel translate, then
4165/// sequential in-place squeeze on the translated buffer.
4166/// Otherwise: single full-size buffer, translate + squeeze, one write_all.
4167pub fn translate_squeeze_mmap(
4168    set1: &[u8],
4169    set2: &[u8],
4170    data: &[u8],
4171    writer: &mut impl Write,
4172) -> io::Result<()> {
4173    let table = build_translate_table(set1, set2);
4174    let squeeze_set = build_member_set(set2);
4175
4176    // For large data: two-phase approach
4177    // Phase 1: parallel translate into buffer
4178    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
4179    //          (squeeze only removes bytes, never grows, so no second allocation needed)
4180    if data.len() >= PARALLEL_THRESHOLD {
4181        // Phase 1: parallel translate
4182        let mut translated = alloc_uninit_vec(data.len());
4183        let range_info = detect_range_offset(&table);
4184        let n_threads = rayon::current_num_threads().max(1);
4185        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4186
4187        if let Some((lo, hi, offset)) = range_info {
4188            data.par_chunks(chunk_size)
4189                .zip(translated.par_chunks_mut(chunk_size))
4190                .for_each(|(src_chunk, dst_chunk)| {
4191                    translate_range_simd(
4192                        src_chunk,
4193                        &mut dst_chunk[..src_chunk.len()],
4194                        lo,
4195                        hi,
4196                        offset,
4197                    );
4198                });
4199        } else {
4200            data.par_chunks(chunk_size)
4201                .zip(translated.par_chunks_mut(chunk_size))
4202                .for_each(|(src_chunk, dst_chunk)| {
4203                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
4204                });
4205        }
4206
4207        // Phase 2: squeeze in-place on the translated buffer.
4208        // Since squeeze only removes bytes (never grows), we can read ahead and
4209        // compact into the same buffer, saving a full data.len() heap allocation.
4210        let mut last_squeezed: u16 = 256;
4211        let len = translated.len();
4212        let mut wp = 0;
4213        unsafe {
4214            let ptr = translated.as_mut_ptr();
4215            let mut i = 0;
4216            while i < len {
4217                let b = *ptr.add(i);
4218                if is_member(&squeeze_set, b) {
4219                    if last_squeezed == b as u16 {
4220                        i += 1;
4221                        continue;
4222                    }
4223                    last_squeezed = b as u16;
4224                } else {
4225                    last_squeezed = 256;
4226                }
4227                *ptr.add(wp) = b;
4228                wp += 1;
4229                i += 1;
4230            }
4231        }
4232        return writer.write_all(&translated[..wp]);
4233    }
4234
4235    // Single-allocation translate+squeeze: full-size buffer, single write_all.
4236    // For 10MB data, this does 1 write() instead of ~40 chunked writes.
4237    let mut buf = alloc_uninit_vec(data.len());
4238    translate_to(data, &mut buf, &table);
4239    let mut last_squeezed: u16 = 256;
4240    let mut wp = 0;
4241    unsafe {
4242        let ptr = buf.as_mut_ptr();
4243        for i in 0..data.len() {
4244            let b = *ptr.add(i);
4245            if is_member(&squeeze_set, b) {
4246                if last_squeezed == b as u16 {
4247                    continue;
4248                }
4249                last_squeezed = b as u16;
4250            } else {
4251                last_squeezed = 256;
4252            }
4253            *ptr.add(wp) = b;
4254            wp += 1;
4255        }
4256    }
4257    writer.write_all(&buf[..wp])
4258}
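// End-to-end sketch for the mmap translate+squeeze path: set2 doubles as the
// squeeze set, matching `tr -s SET1 SET2` (assumes set expansion happened upstream).
#[cfg(test)]
mod translate_squeeze_mmap_sketch {
    use super::*;

    #[test]
    fn translates_then_squeezes() -> io::Result<()> {
        let mut out: Vec<u8> = Vec::new();
        translate_squeeze_mmap(b"abc", b"xxx", b"aabbcc d", &mut out)?;
        // every a/b/c becomes 'x', then the run of x's collapses to one
        assert_eq!(&out[..], b"x d");
        Ok(())
    }
}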
4259
4260/// Delete from mmap'd byte slice.
4261///
4262/// Uses memchr-based paths for 1-3 delete chars and a SIMD range path for
4263/// contiguous sets; sparse deletes go through zero-copy writev, dense deletes
4264/// through compaction (parallel for data >= PARALLEL_THRESHOLD, else 256KB chunks).
4265pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4266    if delete_chars.len() == 1 {
4267        return delete_single_char_mmap(delete_chars[0], data, writer);
4268    }
4269    if delete_chars.len() <= 3 {
4270        return delete_multi_memchr_mmap(delete_chars, data, writer);
4271    }
4272
4273    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
4274    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
4275        return delete_range_mmap(data, writer, lo, hi);
4276    }
4277
4278    let member = build_member_set(delete_chars);
4279
4280    // Heuristic: estimate total delete positions. Zero-copy writev is only efficient
4281    // when all gaps fit in a single writev call (< MAX_IOV/2 entries). With uniform
4282    // distribution, each delete creates an IoSlice entry. For many deletes (> 512),
4283    // multiple writev calls are needed, and the compact approach is faster.
4284    let sample_size = data.len().min(1024);
4285    let sample_deletes = data[..sample_size]
4286        .iter()
4287        .filter(|&&b| is_member(&member, b))
4288        .count();
4289    let estimated_deletes = if sample_size > 0 {
4290        data.len() * sample_deletes / sample_size
4291    } else {
4292        data.len()
4293    };
4294
4295    if estimated_deletes < MAX_IOV / 2 {
4296        return delete_bitset_zerocopy(data, &member, writer);
4297    }
4298
4299    // Dense delete: parallel compact with writev (avoids scatter-gather copy)
4300    if data.len() >= PARALLEL_THRESHOLD {
4301        let n_threads = rayon::current_num_threads().max(1);
4302        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4303
4304        let mut outbuf = alloc_uninit_vec(data.len());
4305        let chunk_lens: Vec<usize> = data
4306            .par_chunks(chunk_size)
4307            .zip(outbuf.par_chunks_mut(chunk_size))
4308            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
4309            .collect();
4310
4311        // Use writev to write each chunk at its original position, avoiding
4312        // the O(N) scatter-gather memmove. With ~4 threads, that's 4 IoSlice
4313        // entries — far below MAX_IOV.
4314        let slices: Vec<std::io::IoSlice> = chunk_lens
4315            .iter()
4316            .enumerate()
4317            .filter(|&(_, &len)| len > 0)
4318            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4319            .collect();
4320        return write_ioslices(writer, &slices);
4321    }
4322
4323    // Streaming compact: 256KB output buffer reduces page fault overhead.
4324    // For 10MB data: ~64 page faults instead of ~2500, with ~40 write_all calls.
4325    const COMPACT_BUF: usize = 256 * 1024;
4326    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4327
4328    for chunk in data.chunks(COMPACT_BUF) {
4329        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
4330        if out_pos > 0 {
4331            writer.write_all(&outbuf[..out_pos])?;
4332        }
4333    }
4334    Ok(())
4335}
4336
4337/// SIMD range delete for mmap data.
4338/// Uses a density heuristic: for sparse deletes (< 15%), uses zero-copy writev
4339/// directly from mmap data (no output buffer allocation). For dense deletes,
4340/// uses SIMD compact into a pre-allocated buffer.
4341fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
4342    // Sample first 1024 bytes to estimate delete density
4343    let sample_size = data.len().min(1024);
4344    let sample_deletes = data[..sample_size]
4345        .iter()
4346        .filter(|&&b| b >= lo && b <= hi)
4347        .count();
4348    // Estimate expected number of delete positions (IoSlice entries for zero-copy).
4349    // Each delete creates an IoSlice entry. With MAX_IOV=1024 per writev,
4350    // if estimated_deletes > MAX_IOV/2, the writev overhead from multiple syscalls
4351    // exceeds the compact approach cost. Only use zero-copy when all gaps fit in
4352    // a single writev call.
4353    let estimated_deletes = if sample_size > 0 {
4354        data.len() * sample_deletes / sample_size
4355    } else {
4356        data.len()
4357    };
4358    if estimated_deletes < MAX_IOV / 2 {
4359        return delete_range_mmap_zerocopy(data, writer, lo, hi);
4360    }
4361
4362    // Dense deletes: parallel compact with writev (avoids scatter-gather copy)
4363    if data.len() >= PARALLEL_THRESHOLD {
4364        let n_threads = rayon::current_num_threads().max(1);
4365        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4366
4367        let mut outbuf = alloc_uninit_vec(data.len());
4368        let chunk_lens: Vec<usize> = data
4369            .par_chunks(chunk_size)
4370            .zip(outbuf.par_chunks_mut(chunk_size))
4371            .map(|(src_chunk, dst_chunk)| delete_range_chunk(src_chunk, dst_chunk, lo, hi))
4372            .collect();
4373
4374        // Use writev to write each chunk at its original position, avoiding
4375        // the O(N) scatter-gather memmove.
4376        let slices: Vec<std::io::IoSlice> = chunk_lens
4377            .iter()
4378            .enumerate()
4379            .filter(|&(_, &len)| len > 0)
4380            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4381            .collect();
4382        return write_ioslices(writer, &slices);
4383    }
4384
4385    // Streaming compact: use 256KB output buffer instead of full data.len() buffer.
4386    // This reduces page fault overhead from ~2500 faults (10MB) to ~64 faults (256KB).
4387    // The extra write_all calls (~40 for 10MB) are negligible cost.
4388    const COMPACT_BUF: usize = 256 * 1024;
4389    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4390
4391    #[cfg(target_arch = "x86_64")]
4392    {
4393        let mut wp = 0;
4394        let level = get_simd_level();
4395        let len = data.len();
4396        let sp = data.as_ptr();
4397        let dp = outbuf.as_mut_ptr();
4398        let mut ri = 0;
4399
4400        if level >= 3 {
4401            use std::arch::x86_64::*;
4402            let range = hi - lo;
4403            let bias_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8) };
4404            let threshold_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8) };
4405            let zero = unsafe { _mm256_setzero_si256() };
4406
4407            while ri + 32 <= len {
4408                // Flush when output buffer is nearly full
4409                if wp + 32 > COMPACT_BUF {
4410                    writer.write_all(&outbuf[..wp])?;
4411                    wp = 0;
4412                }
4413
4414                let input = unsafe { _mm256_loadu_si256(sp.add(ri) as *const _) };
4415                let biased = unsafe { _mm256_add_epi8(input, bias_v) };
4416                let gt = unsafe { _mm256_cmpgt_epi8(biased, threshold_v) };
4417                let in_range = unsafe { _mm256_cmpeq_epi8(gt, zero) };
4418                let keep_mask = !(unsafe { _mm256_movemask_epi8(in_range) } as u32);
4419
4420                if keep_mask == 0xFFFFFFFF {
4421                    unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32) };
4422                    wp += 32;
4423                } else if keep_mask != 0 {
4424                    let m0 = keep_mask as u8;
4425                    let m1 = (keep_mask >> 8) as u8;
4426                    let m2 = (keep_mask >> 16) as u8;
4427                    let m3 = (keep_mask >> 24) as u8;
4428
4429                    if m0 == 0xFF {
4430                        unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8) };
4431                    } else if m0 != 0 {
4432                        unsafe { compact_8bytes_simd(sp.add(ri), dp.add(wp), m0) };
4433                    }
4434                    let c0 = m0.count_ones() as usize;
4435
4436                    if m1 == 0xFF {
4437                        unsafe {
4438                            std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8)
4439                        };
4440                    } else if m1 != 0 {
4441                        unsafe { compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1) };
4442                    }
4443                    let c1 = m1.count_ones() as usize;
4444
4445                    if m2 == 0xFF {
4446                        unsafe {
4447                            std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8)
4448                        };
4449                    } else if m2 != 0 {
4450                        unsafe { compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2) };
4451                    }
4452                    let c2 = m2.count_ones() as usize;
4453
4454                    if m3 == 0xFF {
4455                        unsafe {
4456                            std::ptr::copy_nonoverlapping(
4457                                sp.add(ri + 24),
4458                                dp.add(wp + c0 + c1 + c2),
4459                                8,
4460                            )
4461                        };
4462                    } else if m3 != 0 {
4463                        unsafe {
4464                            compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3)
4465                        };
4466                    }
4467                    let c3 = m3.count_ones() as usize;
4468                    wp += c0 + c1 + c2 + c3;
4469                }
4470                ri += 32;
4471            }
4472        }
4473
4474        // Scalar tail (processes the entire input when AVX2 is unavailable)
4475        while ri < len {
4476            if wp + 1 > COMPACT_BUF {
4477                writer.write_all(&outbuf[..wp])?;
4478                wp = 0;
4479            }
4480            let b = unsafe { *sp.add(ri) };
4481            unsafe { *dp.add(wp) = b };
4482            wp += (b < lo || b > hi) as usize;
4483            ri += 1;
4484        }
4485
4486        if wp > 0 {
4487            writer.write_all(&outbuf[..wp])?;
4488        }
4489        return Ok(());
4490    }
4491
4492    #[cfg(not(target_arch = "x86_64"))]
4493    {
4494        // Non-x86 fallback: chunk the source and process with delete_range_chunk
4495        for chunk in data.chunks(COMPACT_BUF) {
4496            let clen = delete_range_chunk(chunk, &mut outbuf, lo, hi);
4497            if clen > 0 {
4498                writer.write_all(&outbuf[..clen])?;
4499            }
4500        }
4501        return Ok(());
4502    }
4503
4504    #[allow(unreachable_code)]
4505    Ok(())
4506}
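
// Behavior sketch for the dense-delete path, with Vec<u8> standing in for stdout:
// ~50% in-range bytes pushes the sample estimate far above MAX_IOV/2, so the
// streaming 256KB compact buffer is exercised; output must equal a naive filter.
#[cfg(test)]
mod delete_range_mmap_sketch {
    use super::*;

    #[test]
    fn dense_delete_matches_naive_filter() {
        let data: Vec<u8> = (0..512 * 1024u32)
            .map(|i| if i % 2 == 0 { b'7' } else { b'x' })
            .collect();
        let mut out = Vec::new();
        delete_range_mmap(&data, &mut out, b'0', b'9').unwrap();
        let expected: Vec<u8> = data
            .iter()
            .copied()
            .filter(|&b| !(b'0'..=b'9').contains(&b))
            .collect();
        assert_eq!(out, expected);
    }
}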
4507
4508/// Zero-copy range delete for mmap data: SIMD-scans for bytes in [lo..=hi],
4509/// builds IoSlice entries pointing to the gaps between deleted ranges in the
4510/// original mmap data, and writes using writev. No output buffer allocation.
4511/// For 10MB text with 4% digits: ~1.5ms vs ~4ms for the compact approach.
4512fn delete_range_mmap_zerocopy(
4513    data: &[u8],
4514    writer: &mut impl Write,
4515    lo: u8,
4516    hi: u8,
4517) -> io::Result<()> {
4518    #[cfg(target_arch = "x86_64")]
4519    {
4520        if get_simd_level() >= 3 {
4521            return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
4522        }
4523        if get_simd_level() >= 2 {
4524            return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
4525        }
4526    }
4527
4528    #[cfg(target_arch = "aarch64")]
4529    {
4530        return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
4531    }
4532
4533    // Scalar fallback: byte-by-byte scan with IoSlice batching
4534    #[allow(unreachable_code)]
4535    delete_range_zerocopy_scalar(data, writer, lo, hi)
4536}
4537
4538/// Scalar zero-copy range delete: byte-by-byte scan with IoSlice batching.
4539/// Used as fallback when SIMD is unavailable.
4540fn delete_range_zerocopy_scalar(
4541    data: &[u8],
4542    writer: &mut impl Write,
4543    lo: u8,
4544    hi: u8,
4545) -> io::Result<()> {
4546    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4547    let len = data.len();
4548    let mut run_start: usize = 0;
4549    let mut i: usize = 0;
4550
4551    while i < len {
4552        let b = unsafe { *data.get_unchecked(i) };
4553        if b >= lo && b <= hi {
4554            if i > run_start {
4555                iov.push(std::io::IoSlice::new(&data[run_start..i]));
4556                if iov.len() >= MAX_IOV {
4557                    write_ioslices(writer, &iov)?;
4558                    iov.clear();
4559                }
4560            }
4561            run_start = i + 1;
4562        }
4563        i += 1;
4564    }
4565    if run_start < len {
4566        iov.push(std::io::IoSlice::new(&data[run_start..]));
4567    }
4568    if !iov.is_empty() {
4569        write_ioslices(writer, &iov)?;
4570    }
4571    Ok(())
4572}
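
// Quick sanity sketch for the scalar zero-copy path, using Vec<u8> as the writer:
// the writev-batched output must equal a naive filter of bytes outside [lo, hi].
#[cfg(test)]
mod zerocopy_scalar_sketch {
    use super::*;

    #[test]
    fn matches_naive_filter() {
        let data: Vec<u8> = (0..10_000u32).map(|i| (i % 251) as u8).collect();
        let (lo, hi) = (b'0', b'9');
        let mut out = Vec::new();
        delete_range_zerocopy_scalar(&data, &mut out, lo, hi).unwrap();
        let expected: Vec<u8> = data.iter().copied().filter(|&b| b < lo || b > hi).collect();
        assert_eq!(out, expected);
    }
}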
4573
4574/// AVX2 zero-copy range delete: scans 32 bytes at a time using SIMD range
4575/// comparison, then iterates only the delete positions from the bitmask.
4576/// Blocks with no deletes (common for sparse data) skip with zero per-byte work.
4577#[cfg(target_arch = "x86_64")]
4578#[target_feature(enable = "avx2")]
4579unsafe fn delete_range_zerocopy_avx2(
4580    data: &[u8],
4581    writer: &mut impl Write,
4582    lo: u8,
4583    hi: u8,
4584) -> io::Result<()> {
4585    use std::arch::x86_64::*;
4586
4587    unsafe {
4588        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4589        let len = data.len();
4590        let mut run_start: usize = 0;
4591        let mut ri: usize = 0;
4592
4593        let range = hi - lo;
4594        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4595        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
4596        let zero = _mm256_setzero_si256();
4597
4598        while ri + 32 <= len {
4599            let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
4600            let biased = _mm256_add_epi8(input, bias_v);
4601            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
4602            let in_range = _mm256_cmpeq_epi8(gt, zero);
4603            let del_mask = _mm256_movemask_epi8(in_range) as u32;
4604
4605            if del_mask == 0 {
4606                // No bytes to delete — run continues
4607                ri += 32;
4608                continue;
4609            }
4610
4611            // Process each deleted byte position from the bitmask
4612            let mut m = del_mask;
4613            while m != 0 {
4614                let bit = m.trailing_zeros() as usize;
4615                let abs_pos = ri + bit;
4616                if abs_pos > run_start {
4617                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4618                    if iov.len() >= MAX_IOV {
4619                        write_ioslices(writer, &iov)?;
4620                        iov.clear();
4621                    }
4622                }
4623                run_start = abs_pos + 1;
4624                m &= m - 1; // clear lowest set bit (blsr)
4625            }
4626
4627            ri += 32;
4628        }
4629
4630        // Scalar tail
4631        while ri < len {
4632            let b = *data.get_unchecked(ri);
4633            if b >= lo && b <= hi {
4634                if ri > run_start {
4635                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4636                    if iov.len() >= MAX_IOV {
4637                        write_ioslices(writer, &iov)?;
4638                        iov.clear();
4639                    }
4640                }
4641                run_start = ri + 1;
4642            }
4643            ri += 1;
4644        }
4645
4646        if run_start < len {
4647            iov.push(std::io::IoSlice::new(&data[run_start..]));
4648        }
4649        if !iov.is_empty() {
4650            write_ioslices(writer, &iov)?;
4651        }
4652        Ok(())
4653    }
4654}
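
// Hedged parity sketch: on machines where AVX2 is detected at runtime, the SIMD
// zero-copy path should produce byte-identical output to the scalar fallback above.
#[cfg(all(test, target_arch = "x86_64"))]
mod zerocopy_avx2_parity {
    use super::*;

    #[test]
    fn avx2_matches_scalar() {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return; // nothing to compare on this machine
        }
        let data: Vec<u8> = (0..64 * 1024u32).map(|i| (i * 31 % 256) as u8).collect();
        let (lo, hi) = (b'A', b'Z');
        let mut simd_out = Vec::new();
        let mut scalar_out = Vec::new();
        // SAFETY: AVX2 availability was just checked at runtime.
        unsafe { delete_range_zerocopy_avx2(&data, &mut simd_out, lo, hi).unwrap() };
        delete_range_zerocopy_scalar(&data, &mut scalar_out, lo, hi).unwrap();
        assert_eq!(simd_out, scalar_out);
    }
}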
4655
4656/// SSE2 zero-copy range delete: same approach as AVX2 but with 16-byte blocks.
4657#[cfg(target_arch = "x86_64")]
4658#[target_feature(enable = "sse2")]
4659unsafe fn delete_range_zerocopy_sse2(
4660    data: &[u8],
4661    writer: &mut impl Write,
4662    lo: u8,
4663    hi: u8,
4664) -> io::Result<()> {
4665    use std::arch::x86_64::*;
4666
4667    unsafe {
4668        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4669        let len = data.len();
4670        let mut run_start: usize = 0;
4671        let mut ri: usize = 0;
4672
4673        let range = hi - lo;
4674        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4675        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
4676        let zero = _mm_setzero_si128();
4677
4678        while ri + 16 <= len {
4679            let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
4680            let biased = _mm_add_epi8(input, bias_v);
4681            let gt = _mm_cmpgt_epi8(biased, threshold_v);
4682            let in_range = _mm_cmpeq_epi8(gt, zero);
4683            let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;
4684
4685            if del_mask == 0 {
4686                ri += 16;
4687                continue;
4688            }
4689
4690            let mut m = del_mask;
4691            while m != 0 {
4692                let bit = m.trailing_zeros() as usize;
4693                let abs_pos = ri + bit;
4694                if abs_pos > run_start {
4695                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4696                    if iov.len() >= MAX_IOV {
4697                        write_ioslices(writer, &iov)?;
4698                        iov.clear();
4699                    }
4700                }
4701                run_start = abs_pos + 1;
4702                m &= m - 1;
4703            }
4704
4705            ri += 16;
4706        }
4707
4708        while ri < len {
4709            let b = *data.get_unchecked(ri);
4710            if b >= lo && b <= hi {
4711                if ri > run_start {
4712                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4713                    if iov.len() >= MAX_IOV {
4714                        write_ioslices(writer, &iov)?;
4715                        iov.clear();
4716                    }
4717                }
4718                run_start = ri + 1;
4719            }
4720            ri += 1;
4721        }
4722
4723        if run_start < len {
4724            iov.push(std::io::IoSlice::new(&data[run_start..]));
4725        }
4726        if !iov.is_empty() {
4727            write_ioslices(writer, &iov)?;
4728        }
4729        Ok(())
4730    }
4731}
4732
4733/// NEON zero-copy range delete for aarch64: scans 16 bytes at a time using
4734/// NEON unsigned comparison, creates bitmask via pairwise narrowing, then
4735/// iterates delete positions from the bitmask.
4736#[cfg(target_arch = "aarch64")]
4737#[target_feature(enable = "neon")]
4738unsafe fn delete_range_zerocopy_neon(
4739    data: &[u8],
4740    writer: &mut impl Write,
4741    lo: u8,
4742    hi: u8,
4743) -> io::Result<()> {
4744    use std::arch::aarch64::*;
4745
4746    unsafe {
4747        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4748        let len = data.len();
4749        let mut run_start: usize = 0;
4750        let mut ri: usize = 0;
4751
4752        let lo_v = vdupq_n_u8(lo);
4753        let hi_v = vdupq_n_u8(hi);
4754        // Bit position mask for extracting bitmask from comparison results
4755        let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
4756        let bit_mask_v = vld1q_u8(bit_mask.as_ptr());
4757
4758        while ri + 16 <= len {
4759            let input = vld1q_u8(data.as_ptr().add(ri));
4760            // in_range = 0xFF where lo <= byte <= hi
4761            let ge_lo = vcgeq_u8(input, lo_v);
4762            let le_hi = vcleq_u8(input, hi_v);
4763            let in_range = vandq_u8(ge_lo, le_hi);
4764
4765            // Create 16-bit bitmask: reduce 16 bytes to 2 bytes
4766            let bits = vandq_u8(in_range, bit_mask_v);
4767            let pair = vpaddlq_u8(bits); // u8→u16 pairwise add
4768            let quad = vpaddlq_u16(pair); // u16→u32
4769            let octet = vpaddlq_u32(quad); // u32→u64
4770            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
4771            let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
4772            let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;
4773
4774            if del_mask == 0 {
4775                // No bytes to delete — run continues
4776                ri += 16;
4777                continue;
4778            }
4779
4780            // Process each deleted byte position
4781            let mut m = del_mask;
4782            while m != 0 {
4783                let bit = m.trailing_zeros() as usize;
4784                let abs_pos = ri + bit;
4785                if abs_pos > run_start {
4786                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4787                    if iov.len() >= MAX_IOV {
4788                        write_ioslices(writer, &iov)?;
4789                        iov.clear();
4790                    }
4791                }
4792                run_start = abs_pos + 1;
4793                m &= m - 1;
4794            }
4795
4796            ri += 16;
4797        }
4798
4799        // Scalar tail
4800        while ri < len {
4801            let b = *data.get_unchecked(ri);
4802            if b >= lo && b <= hi {
4803                if ri > run_start {
4804                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4805                    if iov.len() >= MAX_IOV {
4806                        write_ioslices(writer, &iov)?;
4807                        iov.clear();
4808                    }
4809                }
4810                run_start = ri + 1;
4811            }
4812            ri += 1;
4813        }
4814
4815        if run_start < len {
4816            iov.push(std::io::IoSlice::new(&data[run_start..]));
4817        }
4818        if !iov.is_empty() {
4819            write_ioslices(writer, &iov)?;
4820        }
4821        Ok(())
4822    }
4823}
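
// Hedged parity sketch for the NEON path: the pairwise-narrowing bitmask
// extraction should yield the same output as the scalar fallback.
#[cfg(all(test, target_arch = "aarch64"))]
mod zerocopy_neon_parity {
    use super::*;

    #[test]
    fn neon_matches_scalar() {
        let data: Vec<u8> = (0..64 * 1024u32).map(|i| (i * 31 % 256) as u8).collect();
        let (lo, hi) = (b'a', b'z');
        let mut simd_out = Vec::new();
        let mut scalar_out = Vec::new();
        // SAFETY: NEON is part of the aarch64 baseline, so it is always available.
        unsafe { delete_range_zerocopy_neon(&data, &mut simd_out, lo, hi).unwrap() };
        delete_range_zerocopy_scalar(&data, &mut scalar_out, lo, hi).unwrap();
        assert_eq!(simd_out, scalar_out);
    }
}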
4824
4825/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
4826/// Returns number of bytes written.
4827#[inline]
4828fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
4829    let len = chunk.len();
4830    let mut out_pos = 0;
4831    let mut i = 0;
4832
4833    while i + 8 <= len {
4834        unsafe {
4835            let b0 = *chunk.get_unchecked(i);
4836            let b1 = *chunk.get_unchecked(i + 1);
4837            let b2 = *chunk.get_unchecked(i + 2);
4838            let b3 = *chunk.get_unchecked(i + 3);
4839            let b4 = *chunk.get_unchecked(i + 4);
4840            let b5 = *chunk.get_unchecked(i + 5);
4841            let b6 = *chunk.get_unchecked(i + 6);
4842            let b7 = *chunk.get_unchecked(i + 7);
4843
4844            *outbuf.get_unchecked_mut(out_pos) = b0;
4845            out_pos += !is_member(member, b0) as usize;
4846            *outbuf.get_unchecked_mut(out_pos) = b1;
4847            out_pos += !is_member(member, b1) as usize;
4848            *outbuf.get_unchecked_mut(out_pos) = b2;
4849            out_pos += !is_member(member, b2) as usize;
4850            *outbuf.get_unchecked_mut(out_pos) = b3;
4851            out_pos += !is_member(member, b3) as usize;
4852            *outbuf.get_unchecked_mut(out_pos) = b4;
4853            out_pos += !is_member(member, b4) as usize;
4854            *outbuf.get_unchecked_mut(out_pos) = b5;
4855            out_pos += !is_member(member, b5) as usize;
4856            *outbuf.get_unchecked_mut(out_pos) = b6;
4857            out_pos += !is_member(member, b6) as usize;
4858            *outbuf.get_unchecked_mut(out_pos) = b7;
4859            out_pos += !is_member(member, b7) as usize;
4860        }
4861        i += 8;
4862    }
4863
4864    while i < len {
4865        unsafe {
4866            let b = *chunk.get_unchecked(i);
4867            *outbuf.get_unchecked_mut(out_pos) = b;
4868            out_pos += !is_member(member, b) as usize;
4869        }
4870        i += 1;
4871    }
4872
4873    out_pos
4874}
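
// A minimal sketch of the branchless compact above. Note the precondition:
// `outbuf` must be at least `chunk.len()` long, because deleted bytes are still
// written speculatively and only the output cursor advance is conditional.
#[cfg(test)]
mod delete_chunk_bitset_sketch {
    use super::*;

    #[test]
    fn compacts_kept_bytes_in_order() {
        let member = build_member_set(b"aeiou");
        let chunk = b"the quick brown fox";
        let mut outbuf = vec![0u8; chunk.len()];
        let n = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        assert_eq!(&outbuf[..n], b"th qck brwn fx");
    }
}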
4875
4876/// Zero-copy delete for general bitset: scan for runs of kept bytes,
4877/// build IoSlice entries pointing directly into the source data.
4878/// No allocation for output data — just ~16 bytes per IoSlice entry.
4879/// Flushes in MAX_IOV-sized batches for efficient writev.
4880fn delete_bitset_zerocopy(
4881    data: &[u8],
4882    member: &[u8; 32],
4883    writer: &mut impl Write,
4884) -> io::Result<()> {
4885    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4886    let len = data.len();
4887    let mut i = 0;
4888    let mut run_start: Option<usize> = None;
4889
4890    while i < len {
4891        let b = unsafe { *data.get_unchecked(i) };
4892        if is_member(member, b) {
4893            // This byte should be deleted
4894            if let Some(rs) = run_start {
4895                iov.push(std::io::IoSlice::new(&data[rs..i]));
4896                run_start = None;
4897                if iov.len() >= MAX_IOV {
4898                    write_ioslices(writer, &iov)?;
4899                    iov.clear();
4900                }
4901            }
4902        } else {
4903            // This byte should be kept
4904            if run_start.is_none() {
4905                run_start = Some(i);
4906            }
4907        }
4908        i += 1;
4909    }
4910    // Flush final run
4911    if let Some(rs) = run_start {
4912        iov.push(std::io::IoSlice::new(&data[rs..]));
4913    }
4914    if !iov.is_empty() {
4915        write_ioslices(writer, &iov)?;
4916    }
4917    Ok(())
4918}
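
// Quick check, with Vec<u8> standing in for stdout: the run builder must emit
// exactly the bytes that are not in the membership set, in order.
#[cfg(test)]
mod delete_bitset_zerocopy_sketch {
    use super::*;

    #[test]
    fn matches_naive_filter() {
        let member = build_member_set(b",;:");
        let data = b"a,b;;c::d,,e";
        let mut out = Vec::new();
        delete_bitset_zerocopy(data, &member, &mut out).unwrap();
        assert_eq!(&out[..], b"abcde");
    }
}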
4919
4920fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4921    // Streaming zero-copy delete using writev: build IoSlice batches of MAX_IOV
4922    // pointing to gaps between deleted characters, write each batch immediately.
4923    // Avoids allocating the full Vec<IoSlice> for all positions.
4924    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4925    let mut last = 0;
4926    for pos in memchr::memchr_iter(ch, data) {
4927        if pos > last {
4928            iov.push(std::io::IoSlice::new(&data[last..pos]));
4929            if iov.len() >= MAX_IOV {
4930                write_ioslices(writer, &iov)?;
4931                iov.clear();
4932            }
4933        }
4934        last = pos + 1;
4935    }
4936    if last < data.len() {
4937        iov.push(std::io::IoSlice::new(&data[last..]));
4938    }
4939    if !iov.is_empty() {
4940        write_ioslices(writer, &iov)?;
4941    }
4942    Ok(())
4943}
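
// Usage sketch (roughly `tr -d '\n'` over an in-memory buffer), assuming a
// Vec<u8> writer stands in for stdout.
#[cfg(test)]
mod delete_single_char_sketch {
    use super::*;

    #[test]
    fn removes_every_occurrence() {
        let data = b"line one\nline two\n\nline three\n";
        let mut out = Vec::new();
        delete_single_char_mmap(b'\n', data, &mut out).unwrap();
        assert_eq!(&out[..], b"line oneline twoline three");
    }
}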
4944
4945fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4946    let c0 = chars[0];
4947    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
4948    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
4949    let is_three = chars.len() >= 3;
4950
4951    // Streaming zero-copy delete: batch IoSlice entries and write in groups of MAX_IOV.
4952    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4953    let mut last = 0;
4954
4955    macro_rules! process_pos {
4956        ($pos:expr) => {
4957            if $pos > last {
4958                iov.push(std::io::IoSlice::new(&data[last..$pos]));
4959                if iov.len() >= MAX_IOV {
4960                    write_ioslices(writer, &iov)?;
4961                    iov.clear();
4962                }
4963            }
4964            last = $pos + 1;
4965        };
4966    }
4967
4968    if is_three {
4969        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
4970            process_pos!(pos);
4971        }
4972    } else {
4973        for pos in memchr::memchr2_iter(c0, c1, data) {
4974            process_pos!(pos);
4975        }
4976    }
4977    if last < data.len() {
4978        iov.push(std::io::IoSlice::new(&data[last..]));
4979    }
4980    if !iov.is_empty() {
4981        write_ioslices(writer, &iov)?;
4982    }
4983    Ok(())
4984}
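
// Usage sketch for the 2/3-char memchr path (roughly `tr -d ',;'`), again with
// a Vec<u8> writer standing in for stdout.
#[cfg(test)]
mod delete_multi_memchr_sketch {
    use super::*;

    #[test]
    fn removes_both_characters() {
        let mut out = Vec::new();
        delete_multi_memchr_mmap(b",;", b"a,b;c,,;d", &mut out).unwrap();
        assert_eq!(&out[..], b"abcd");
    }
}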
4985
4986/// Delete + squeeze from mmap'd byte slice.
4987///
4988/// Performs delete + squeeze in a single pass into one full-size output
4989/// buffer, then issues a single write_all call.
4990pub fn delete_squeeze_mmap(
4991    delete_chars: &[u8],
4992    squeeze_chars: &[u8],
4993    data: &[u8],
4994    writer: &mut impl Write,
4995) -> io::Result<()> {
4996    let delete_set = build_member_set(delete_chars);
4997    let squeeze_set = build_member_set(squeeze_chars);
4998
4999    // Single-allocation delete+squeeze: full-size buffer, single write_all.
5000    let mut outbuf = alloc_uninit_vec(data.len());
5001    let mut last_squeezed: u16 = 256;
5002    let mut out_pos = 0;
5003
5004    for &b in data.iter() {
5005        if is_member(&delete_set, b) {
5006            continue;
5007        }
5008        if is_member(&squeeze_set, b) {
5009            if last_squeezed == b as u16 {
5010                continue;
5011            }
5012            last_squeezed = b as u16;
5013        } else {
5014            last_squeezed = 256;
5015        }
5016        unsafe {
5017            *outbuf.get_unchecked_mut(out_pos) = b;
5018        }
5019        out_pos += 1;
5020    }
5021    writer.write_all(&outbuf[..out_pos])
5022}
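
// Usage sketch (roughly `tr -d '0-9' -s ' '`), with Vec<u8> standing in for
// stdout: digits are dropped first, then runs of spaces collapse to one.
#[cfg(test)]
mod delete_squeeze_sketch {
    use super::*;

    #[test]
    fn deletes_then_squeezes() {
        let data = b"a1  b22   c333 d";
        let mut out = Vec::new();
        delete_squeeze_mmap(b"0123456789", b" ", data, &mut out).unwrap();
        assert_eq!(&out[..], b"a b c d");
    }
}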
5023
5024/// Squeeze from mmap'd byte slice.
5025///
5026/// Dispatches 1-3 squeeze chars to dedicated memchr paths. For data >= PARALLEL_THRESHOLD:
5027/// rayon parallel squeeze with per-chunk boundary fixup, written with writev.
5028/// Otherwise: squeeze into one full-size buffer, one write_all syscall.
5029pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
5030    if squeeze_chars.len() == 1 {
5031        return squeeze_single_mmap(squeeze_chars[0], data, writer);
5032    }
5033    if squeeze_chars.len() == 2 {
5034        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
5035    }
5036    if squeeze_chars.len() == 3 {
5037        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
5038    }
5039
5040    let member = build_member_set(squeeze_chars);
5041
5042    // Parallel path: squeeze each chunk independently, then fix boundaries
5043    if data.len() >= PARALLEL_THRESHOLD {
5044        let n_threads = rayon::current_num_threads().max(1);
5045        let chunk_size = (data.len() / n_threads).max(32 * 1024);
5046
5047        let results: Vec<Vec<u8>> = data
5048            .par_chunks(chunk_size)
5049            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
5050            .collect();
5051
5052        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
5053        // and chunk N+1 starts with same byte B, and B is in squeeze set,
5054        // skip the first byte(s) of chunk N+1 that equal B.
5055        // Collect slices for writev to minimize syscalls.
5056        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
5057        for (idx, result) in results.iter().enumerate() {
5058            if result.is_empty() {
5059                continue;
5060            }
5061            if idx > 0 {
5062                // Check boundary: does previous chunk end with same squeezable byte?
5063                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
5064                    if is_member(&member, prev_last) {
5065                        // Skip leading bytes in this chunk that equal prev_last
5066                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
5067                        if skip < result.len() {
5068                            slices.push(std::io::IoSlice::new(&result[skip..]));
5069                        }
5070                        continue;
5071                    }
5072                }
5073            }
5074            slices.push(std::io::IoSlice::new(result));
5075        }
5076        return write_ioslices(writer, &slices);
5077    }
5078
5079    // Single-allocation squeeze: full-size buffer, single write_all.
5080    let mut outbuf = alloc_uninit_vec(data.len());
5081    let len = data.len();
5082    let mut wp = 0;
5083    let mut i = 0;
5084    let mut last_squeezed: u16 = 256;
5085
5086    unsafe {
5087        let inp = data.as_ptr();
5088        let outp = outbuf.as_mut_ptr();
5089
5090        while i < len {
5091            let b = *inp.add(i);
5092            if is_member(&member, b) {
5093                if last_squeezed != b as u16 {
5094                    *outp.add(wp) = b;
5095                    wp += 1;
5096                    last_squeezed = b as u16;
5097                }
5098                i += 1;
5099                while i < len && *inp.add(i) == b {
5100                    i += 1;
5101                }
5102            } else {
5103                last_squeezed = 256;
5104                *outp.add(wp) = b;
5105                wp += 1;
5106                i += 1;
5107            }
5108        }
5109    }
5110    writer.write_all(&outbuf[..wp])
5111}
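
// Quick check of the general bitset path (4+ squeeze chars): each member's runs
// collapse independently; Vec<u8> stands in for stdout.
#[cfg(test)]
mod squeeze_mmap_sketch {
    use super::*;

    #[test]
    fn collapses_runs_of_each_member() {
        let data = b"a--__b..,,,,c";
        let mut out = Vec::new();
        squeeze_mmap(b"-_.,", data, &mut out).unwrap();
        assert_eq!(&out[..], b"a-_b.,c");
    }
}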
5112
5113/// Squeeze a single chunk using bitset membership. Returns squeezed output.
5114fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
5115    let len = chunk.len();
5116    let mut out = Vec::with_capacity(len);
5117    let mut last_squeezed: u16 = 256;
5118    let mut i = 0;
5119
5120    unsafe {
5121        out.set_len(len);
5122        let inp = chunk.as_ptr();
5123        let outp: *mut u8 = out.as_mut_ptr();
5124        let mut wp = 0;
5125
5126        while i < len {
5127            let b = *inp.add(i);
5128            if is_member(member, b) {
5129                if last_squeezed != b as u16 {
5130                    *outp.add(wp) = b;
5131                    wp += 1;
5132                    last_squeezed = b as u16;
5133                }
5134                i += 1;
5135                while i < len && *inp.add(i) == b {
5136                    i += 1;
5137                }
5138            } else {
5139                last_squeezed = 256;
5140                *outp.add(wp) = b;
5141                wp += 1;
5142                i += 1;
5143            }
5144        }
5145        out.set_len(wp);
5146    }
5147    out
5148}
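
// A minimal sketch of the boundary fixup used by the parallel squeeze path:
// when a chunk boundary splits a run, the first chunk keeps one byte and the
// next chunk's leading duplicates are skipped before building the IoSlices.
#[cfg(test)]
mod squeeze_boundary_sketch {
    use super::*;

    #[test]
    fn seam_duplicates_are_skipped() {
        let member = build_member_set(b" ");
        let l = squeeze_chunk_bitset(b"ab   ", &member); // "ab "
        let r = squeeze_chunk_bitset(b"   cd", &member); // " cd"
        assert_eq!(&l[..], b"ab ");
        assert_eq!(&r[..], b" cd");
        // The previous chunk ends with a squeezable ' ', so the next chunk's
        // leading duplicates of that byte are dropped.
        let prev_last = *l.last().unwrap();
        let skip = if is_member(&member, prev_last) {
            r.iter().take_while(|&&b| b == prev_last).count()
        } else {
            0
        };
        let mut joined = l.clone();
        joined.extend_from_slice(&r[skip..]);
        assert_eq!(&joined[..], b"ab cd");
    }
}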
5149
5150fn squeeze_multi_mmap<const N: usize>(
5151    chars: &[u8],
5152    data: &[u8],
5153    writer: &mut impl Write,
5154) -> io::Result<()> {
5155    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
5156    if data.len() >= PARALLEL_THRESHOLD {
5157        let member = build_member_set(chars);
5158        let n_threads = rayon::current_num_threads().max(1);
5159        let chunk_size = (data.len() / n_threads).max(32 * 1024);
5160
5161        let results: Vec<Vec<u8>> = data
5162            .par_chunks(chunk_size)
5163            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
5164            .collect();
5165
5166        // Build IoSlice list, fixing boundaries
5167        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
5168        for (idx, result) in results.iter().enumerate() {
5169            if result.is_empty() {
5170                continue;
5171            }
5172            if idx > 0 {
5173                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
5174                    if is_member(&member, prev_last) {
5175                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
5176                        if skip < result.len() {
5177                            slices.push(std::io::IoSlice::new(&result[skip..]));
5178                        }
5179                        continue;
5180                    }
5181                }
5182            }
5183            slices.push(std::io::IoSlice::new(result));
5184        }
5185        return write_ioslices(writer, &slices);
5186    }
5187
5188    // Zero-copy writev: build IoSlice entries pointing directly into
5189    // the original mmap'd data, keeping one byte per run of squeezable chars.
5190    // Each gap between squeeze points becomes one IoSlice, followed by a
5191    // one-byte IoSlice for the first byte of the run; no data is copied.
5194    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(1024);
5195    let mut cursor = 0;
5196    let mut last_squeezed: u16 = 256;
5197
5198    macro_rules! find_next {
5199        ($data:expr) => {
5200            if N == 2 {
5201                memchr::memchr2(chars[0], chars[1], $data)
5202            } else {
5203                memchr::memchr3(chars[0], chars[1], chars[2], $data)
5204            }
5205        };
5206    }
5207
5208    while cursor < data.len() {
5209        match find_next!(&data[cursor..]) {
5210            Some(offset) => {
5211                let pos = cursor + offset;
5212                let b = data[pos];
5213                // Emit gap before squeeze point
5214                if pos > cursor {
5215                    iov.push(std::io::IoSlice::new(&data[cursor..pos]));
5216                    last_squeezed = 256;
5217                }
5218                // Emit single byte if not duplicate
5219                if last_squeezed != b as u16 {
5220                    // Point at the byte in the original data (zero-copy)
5221                    iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
5222                    last_squeezed = b as u16;
5223                }
5224                // Skip the run of same byte
5225                let mut skip = pos + 1;
5226                while skip < data.len() && data[skip] == b {
5227                    skip += 1;
5228                }
5229                cursor = skip;
5230                // Flush when approaching MAX_IOV
5231                if iov.len() >= MAX_IOV {
5232                    write_ioslices(writer, &iov)?;
5233                    iov.clear();
5234                }
5235            }
5236            None => {
5237                if cursor < data.len() {
5238                    iov.push(std::io::IoSlice::new(&data[cursor..]));
5239                }
5240                break;
5241            }
5242        }
5243    }
5244    if !iov.is_empty() {
5245        write_ioslices(writer, &iov)?;
5246    }
5247    Ok(())
5248}
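
// Usage sketch for the 2-char zero-copy path (roughly `tr -s ' \t'`): each gap
// and each kept run byte becomes an IoSlice into the source; Vec<u8> stands in
// for stdout.
#[cfg(test)]
mod squeeze_two_char_sketch {
    use super::*;

    #[test]
    fn keeps_one_byte_per_run() {
        let data = b"a \t\t b\t\t\tc   ";
        let mut out = Vec::new();
        squeeze_multi_mmap::<2>(b" \t", data, &mut out).unwrap();
        assert_eq!(&out[..], b"a \t b\tc ");
    }
}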
5249
5250fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
5251    if data.is_empty() {
5252        return Ok(());
5253    }
5254
5255    // Quick check: no consecutive pairs means no squeezing needed
5256    let pair = [ch, ch];
5257    if memchr::memmem::find(data, &pair).is_none() {
5258        return writer.write_all(data);
5259    }
5260
5261    // Zero-copy writev approach: build IoSlice entries pointing directly into
5262    // the original mmap'd data, skipping duplicate bytes in runs.
5263    // For `tr -s ' '` on 10MB with ~5K squeeze points:
5264    //   - ~10K IoSlice entries (one per gap + one per squeeze point)
5265    //   - ~10 writev syscalls (at 1024 entries per batch)
5266    //   - Zero data copy — kernel reads directly from mmap pages
5267    let finder = memchr::memmem::Finder::new(&pair);
5268    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
5269    let mut cursor = 0;
5270
5271    while cursor < data.len() {
5272        match finder.find(&data[cursor..]) {
5273            Some(offset) => {
5274                let pair_pos = cursor + offset;
5275                // Include everything up to and including the first byte of the pair
5276                let seg_end = pair_pos + 1;
5277                if seg_end > cursor {
5278                    iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
5279                }
5280                // Skip all remaining consecutive ch bytes (the run)
5281                let mut skip = seg_end;
5282                while skip < data.len() && data[skip] == ch {
5283                    skip += 1;
5284                }
5285                cursor = skip;
5286                // Flush when approaching MAX_IOV
5287                if iov.len() >= MAX_IOV {
5288                    write_ioslices(writer, &iov)?;
5289                    iov.clear();
5290                }
5291            }
5292            None => {
5293                // No more pairs — emit remainder
5294                if cursor < data.len() {
5295                    iov.push(std::io::IoSlice::new(&data[cursor..]));
5296                }
5297                break;
5298            }
5299        }
5300    }
5301
5302    if !iov.is_empty() {
5303        write_ioslices(writer, &iov)?;
5304    }
5305    Ok(())
5306}
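
// Usage sketch (roughly `tr -s ' '`): the memmem pre-check writes input with no
// adjacent pair straight through; otherwise each run collapses to a single byte.
#[cfg(test)]
mod squeeze_single_sketch {
    use super::*;

    #[test]
    fn collapses_runs_of_one_char() {
        let mut out = Vec::new();
        squeeze_single_mmap(b' ', b"a  b    c d", &mut out).unwrap();
        assert_eq!(&out[..], b"a b c d");

        // No consecutive pair: the fast path writes the data through unchanged.
        out.clear();
        squeeze_single_mmap(b' ', b"a b c", &mut out).unwrap();
        assert_eq!(&out[..], b"a b c");
    }
}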