
coreutils_rs/tr/core.rs

use std::io::{self, Read, Write};

use rayon::prelude::*;

/// Maximum IoSlice entries per write_vectored batch.
/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
const MAX_IOV: usize = 1024;

/// Main processing buffer: 4MB (fits in L3 cache, avoids cache thrashing).
const BUF_SIZE: usize = 4 * 1024 * 1024;

/// Stream buffer: 16MB — tr streaming operations (translate, delete, squeeze)
/// are compute-light (single table lookup or bitset check per byte), so the
/// bottleneck is I/O syscalls, not cache pressure. 16MB buffer means only
/// 1 read()/write() syscall pair for a 10MB input, minimizing syscall overhead.
/// For piped input, read_full retries partial reads to fill the entire buffer.
/// This applies to ALL streaming modes (delete, squeeze, translate).
const STREAM_BUF: usize = 16 * 1024 * 1024;

/// Minimum data size to engage rayon parallel processing for mmap paths.
/// AVX2 translation runs at ~10 GB/s per core, processing 10MB in ~1ms.
/// Rayon thread pool creation + sync costs ~100-500us, which nearly
/// negates the parallelism benefit for data under 16MB. Single-threaded
/// AVX2 is faster for typical benchmark workloads (10MB).
const PARALLEL_THRESHOLD: usize = 16 * 1024 * 1024;

/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
/// Falls back to write_all per slice for partial writes.
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                // Partial write: fall back to write_all per remaining slice
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}
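
// Illustrative sanity check of the batching behavior described above: more than
// MAX_IOV slices forces at least two write_vectored batches, and the output must
// be the exact concatenation of the inputs. With Vec<u8> as the writer the full-write
// path is taken; the partial-write fallback would produce the same concatenation.
#[cfg(test)]
#[test]
fn write_ioslices_concatenates_all_batches() {
    let chunk: &[u8] = b"ab";
    let slices: Vec<std::io::IoSlice> =
        (0..MAX_IOV + 3).map(|_| std::io::IoSlice::new(chunk)).collect();
    let mut out = Vec::new();
    write_ioslices(&mut out, &slices).unwrap();
    assert_eq!(out.len(), (MAX_IOV + 3) * chunk.len());
    assert!(out.chunks(2).all(|c| c == chunk));
}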

/// Allocate a Vec<u8> of given length without zero-initialization.
/// SAFETY: Caller must write all bytes before reading them.
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
    unsafe {
        v.set_len(len);
    }
    v
}
/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
/// If set1 is longer than set2, the extra set1 bytes map to the last byte of
/// set2, matching GNU tr's behavior of padding set2 by repeating its final byte.
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}
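
// Small illustrative check of the table construction, including the padding rule
// where set1 entries beyond set2's length map to set2's last byte.
#[cfg(test)]
#[test]
fn translate_table_maps_pairs_and_pads_with_last_byte() {
    let table = build_translate_table(b"abc", b"xy");
    assert_eq!(table[b'a' as usize], b'x');
    assert_eq!(table[b'b' as usize], b'y');
    assert_eq!(table[b'c' as usize], b'y'); // padded with the last byte of set2
    assert_eq!(table[b'z' as usize], b'z'); // bytes outside set1 stay identity
}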

/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}
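
// Quick illustrative check of the bitset: one bit per byte value, indexed by the
// high 5 bits (byte index) and the low 3 bits (bit index).
#[cfg(test)]
#[test]
fn member_set_bitset_lookup() {
    let set = build_member_set(b"aeiou");
    assert!(is_member(&set, b'a'));
    assert!(is_member(&set, b'u'));
    assert!(!is_member(&set, b'b'));
    assert!(!is_member(&set, 0));
}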

/// Cached SIMD capability level for x86_64.
/// 0 = unchecked, 1 = scalar only, 2 = SSSE3, 3 = AVX2
#[cfg(target_arch = "x86_64")]
static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);

#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn get_simd_level() -> u8 {
    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
    if level != 0 {
        return level;
    }
    let detected = if is_x86_feature_detected!("avx2") {
        3
    } else if is_x86_feature_detected!("ssse3") {
        2
    } else {
        1
    };
    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
    detected
}

/// Translate bytes in-place using a 256-byte lookup table.
/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition for
/// ~32 bytes per iteration. Falls back to 8x-unrolled scalar on other platforms.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if level >= 2 {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}

/// Scalar fallback: 8x-unrolled table lookup.
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let mut i = 0;
    unsafe {
        while i + 8 <= len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
// ============================================================================
//
// For an arbitrary 256-byte lookup table, we decompose each byte into
// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
// SIMD vectors, one for each high nibble value h (0..15), containing
// the 16 table entries table[h*16+0..h*16+15]. Then for each input
// vector we:
//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
//   2. Extract high nibble (shift right 4) -> used to select which table
//   3. For each of the 16 high nibble values, create a mask where
//      the high nibble equals that value, pshufb the corresponding
//      table, and accumulate results
//
// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
// With instruction-level parallelism, this achieves much higher throughput
// than scalar table lookups which have serial data dependencies.

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Pre-build 16 lookup vectors, one per high nibble value.
        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
        // 16 table entries for pshufb indexing by low nibble.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            // Broadcast the 128-bit row to both lanes of the 256-bit vector
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            // Accumulate result: for each high nibble value h, select bytes where
            // hi_nibble == h, look up in lut[h] using lo_nibble, blend into result.
            let mut result = _mm256_setzero_si256();

            // Unroll the 16 high-nibble iterations for best ILP
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // SSE/SSSE3 tail for remaining 16-byte chunk
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            // Build 128-bit LUTs (just the lower lane of each 256-bit LUT)
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Pre-build 16 lookup vectors for pshufb
        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
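
// Illustrative equivalence check for the nibble-decomposition path described above:
// the dispatched translate must produce exactly what the scalar table lookup
// produces, across the 32-byte, 16-byte and scalar tails. (On non-x86_64 both
// calls take the scalar path, so the check degenerates to a tautology.)
#[cfg(test)]
#[test]
fn simd_table_translate_matches_scalar() {
    // An arbitrary, non-range table so none of the range fast paths would apply.
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    table[b'a' as usize] = b'z';
    table[b'z' as usize] = b'a';
    table[b'0' as usize] = b'#';

    let src: Vec<u8> = (0..300u32).map(|i| (i % 256) as u8).collect();
    let mut simd = src.clone();
    translate_inplace(&mut simd, &table);
    let mut scalar = src.clone();
    translate_inplace_scalar(&mut scalar, &table);
    assert_eq!(simd, scalar);
}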

/// Translate bytes from source to destination using a 256-byte lookup table.
/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            unsafe { translate_to_avx2_table(src, dst, table) };
            return;
        }
        if level >= 2 {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}

/// Scalar fallback for translate_to.
#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // Pre-build 16 lookup vectors
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // SSSE3 tail for remaining 16-byte chunk
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// SIMD range translation (x86_64)
// ============================================================================

/// Detect if the translate table is a single contiguous range with constant offset.
/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
/// table[i] = i + offset for all i in [lo, hi].
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}
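
// Illustrative check of range detection: lowercase -> uppercase is a contiguous
// range with a constant offset of -32, while a table with mixed offsets is not.
#[cfg(test)]
#[test]
fn detects_lowercase_to_uppercase_as_constant_offset_range() {
    let set1: Vec<u8> = (b'a'..=b'z').collect();
    let set2: Vec<u8> = (b'A'..=b'Z').collect();
    let table = build_translate_table(&set1, &set2);
    assert_eq!(detect_range_offset(&table), Some((b'a', b'z', -32)));

    let mixed = build_translate_table(b"ab", b"BD"); // offsets -31 and -30
    assert_eq!(detect_range_offset(&mixed), None);
}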

/// Detect if the translate table maps a contiguous range [lo..=hi] to a single constant byte,
/// and all other bytes are identity. This covers cases like `tr '\000-\037' 'X'` where
/// a range maps to one replacement character.
/// Returns Some((lo, hi, replacement)) if the pattern matches.
#[inline]
fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut replacement = 0u8;

    for i in 0..256 {
        if table[i] != i as u8 {
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    replacement = table[i];
                }
                Some(_) => {
                    if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, replacement))
}
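
// Illustrative check: a control-character range collapsed to a single byte is
// recognized, matching the `tr '\000-\037' 'X'` example in the doc comment.
#[cfg(test)]
#[test]
fn detects_range_mapped_to_single_constant_byte() {
    let set1: Vec<u8> = (0x00..=0x1F).collect();
    let table = build_translate_table(&set1, b"X");
    assert_eq!(detect_range_to_constant(&table), Some((0x00, 0x1F, b'X')));
}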

/// SIMD-accelerated range-to-constant translation.
/// For tables where a contiguous range [lo..=hi] maps to a single byte, and all
/// other bytes are identity. Uses vectorized range check + blend (5 SIMD ops per
/// 32 bytes with AVX2, vs 48 for general nibble decomposition).
#[cfg(target_arch = "x86_64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
    } else {
        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_to_constant_avx2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm256_set1_epi8(replacement as i8);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // mask = 0xFF where IN range (to replace), 0 where to KEEP
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            // Use blendvb: for each byte, if in_range bit is set, use replacement, else use input
            let result = _mm256_blendv_epi8(input, repl_v, in_range);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let repl_v128 = _mm_set1_epi8(replacement as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let result = _mm_blendv_epi8(input, repl_v128, in_range);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_to_constant_sse2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm_set1_epi8(replacement as i8);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            // in_range mask: 0xFF where in range, 0x00 where not
            let in_range = _mm_cmpeq_epi8(gt, zero);
            // SSE2 blendv: (repl & mask) | (input & ~mask)
            let result = _mm_or_si128(
                _mm_and_si128(in_range, repl_v),
                _mm_andnot_si128(in_range, input),
            );
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = replacement;
        }
    }
}
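
// Illustrative check against a plain scalar reference: digits map to '#',
// everything else is untouched, across the 32/16-byte SIMD loops and scalar tail.
#[cfg(test)]
#[test]
fn range_to_constant_inplace_matches_scalar_reference() {
    let src: Vec<u8> = (0u8..200).collect();
    let mut got = src.clone();
    translate_range_to_constant_simd_inplace(&mut got, b'0', b'9', b'#');
    let want: Vec<u8> = src
        .iter()
        .map(|&b| if (b'0'..=b'9').contains(&b) { b'#' } else { b })
        .collect();
    assert_eq!(got, want);
}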

/// SIMD-accelerated range translation for mmap'd data.
/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Bias: shift range so lo maps to -128 (signed min).
        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
        // For input < lo: biased wraps to large positive (signed), > threshold.
        // For input > hi: biased > -128+range, > threshold.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt = 0xFF where biased > threshold (OUT of range)
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // mask = 0xFF where IN range (NOT gt)
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        // SSE2 tail for 16-byte remainder
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

/// Scalar range translation fallback for non-x86_64.
#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (i, &b) in src.iter().enumerate() {
        dst[i] = if b >= lo && b <= hi {
            b.wrapping_add(offset as u8)
        } else {
            b
        };
    }
}

// ============================================================================
// In-place SIMD range translation (saves one buffer allocation in streaming)
// ============================================================================

/// In-place SIMD-accelerated range translation.
/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = b.wrapping_add(offset as u8);
        }
    }
}
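
// Illustrative use of the in-place range translation: shifting 'a'..='z' by -32
// is ASCII uppercasing, and bytes outside the range are left alone.
#[cfg(test)]
#[test]
fn range_offset_inplace_uppercases_only_the_range() {
    let mut data = b"Hello, World! 123".to_vec();
    translate_range_simd_inplace(&mut data, b'a', b'z', -32);
    assert_eq!(data, b"HELLO, WORLD! 123".to_vec());
}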

// ============================================================================
// SIMD range deletion (x86_64)
// ============================================================================

/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
/// Returns Some((lo, hi)) if so. This is true for common classes:
/// - `[:digit:]` = 0x30..=0x39
/// - `a-z` = 0x61..=0x7A
/// - `A-Z` = 0x41..=0x5A
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in &chars[1..] {
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    // Check that the range size matches the number of chars (no gaps)
    if (hi - lo + 1) as usize == chars.len() {
        Some((lo, hi))
    } else {
        None
    }
}
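
// Illustrative check: digits form a contiguous range, a gappy set does not,
// and an empty set is rejected.
#[cfg(test)]
#[test]
fn delete_range_detection() {
    let digits: Vec<u8> = (b'0'..=b'9').collect();
    assert_eq!(detect_delete_range(&digits), Some((b'0', b'9')));
    assert_eq!(detect_delete_range(b"ace"), None);
    assert_eq!(detect_delete_range(b""), None);
}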

/// SIMD-accelerated delete for contiguous byte ranges.
/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
/// then compacts output by skipping matched bytes.
#[cfg(target_arch = "x86_64")]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    if get_simd_level() >= 3 {
        unsafe { delete_range_avx2(src, dst, lo, hi) }
    } else {
        unsafe { delete_range_sse2(src, dst, lo, hi) }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            // keep_mask bits: 1 = keep (NOT in range)
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            if keep_mask == 0xFFFFFFFF {
                // All 32 bytes are kept — bulk copy
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                // Partial keep — process each 8-byte lane with popcnt
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                let c1 = ((keep_mask >> 8) as u8).count_ones() as usize;
                compact_8bytes(
                    sp.add(ri + 16),
                    dp.add(wp + c0 + c1),
                    (keep_mask >> 16) as u8,
                );
                let c2 = ((keep_mask >> 16) as u8).count_ones() as usize;
                compact_8bytes(
                    sp.add(ri + 24),
                    dp.add(wp + c0 + c1 + c2),
                    (keep_mask >> 24) as u8,
                );
                let c3 = ((keep_mask >> 24) as u8).count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            // else: keep_mask == 0 means all bytes deleted, skip entirely
            ri += 32;
        }

        // SSE2 tail for 16-byte remainder
        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
            }
            ri += 16;
        }

        // Scalar tail
        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}
/// Compact 8 source bytes into contiguous output bytes using a keep mask.
/// Each bit in `mask` indicates whether the corresponding byte should be kept.
/// Iterates the set bits with trailing_zeros, copying only the kept bytes;
/// the caller advances its write pointer by the mask's popcount.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
    // For each set bit position in the mask, copy that byte from src to the
    // next output position. Using a tight trailing_zeros loop is faster than
    // a lookup table for 8-bit masks because it's branch-predicted well
    // and avoids cache misses on the table.
    unsafe {
        let mut m = mask;
        let mut w = 0;
        while m != 0 {
            let bit = m.trailing_zeros() as usize;
            *dst.add(w) = *src.add(bit);
            w += 1;
            m &= m - 1;
        }
    }
}
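
// Illustrative check of the mask-driven compaction: keeping bits 0, 2 and 7
// of "abcdefgh" yields "ach" packed at the front of the destination.
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
#[test]
fn compact_8bytes_keeps_only_masked_bytes() {
    let src = *b"abcdefgh";
    let mut dst = [0u8; 8];
    unsafe { compact_8bytes(src.as_ptr(), dst.as_mut_ptr(), 0b1000_0101) };
    assert_eq!(&dst[..3], &b"ach"[..]);
}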

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                // All 16 bytes kept — bulk copy
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

/// Scalar range delete fallback for non-x86_64.
#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let mut wp = 0;
    for &b in src {
        if b < lo || b > hi {
            dst[wp] = b;
            wp += 1;
        }
    }
    wp
}
/// Streaming delete for contiguous byte ranges using SIMD range detection.
/// Uses the 16MB STREAM_BUF to reduce syscalls (delete is compute-light, I/O bound).
/// When no bytes are deleted from a chunk (common for data with few matches),
/// writes directly from the source buffer to avoid the copy overhead.
fn delete_range_streaming(
    lo: u8,
    hi: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = alloc_uninit_vec(STREAM_BUF);
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
        if wp == n {
            // No bytes deleted — write source directly (avoids copy overhead)
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}
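
// Illustrative end-to-end use of the range-delete streaming path: stripping the
// contiguous digit range 0x30..=0x39 from a small in-memory stream.
#[cfg(test)]
#[test]
fn delete_digit_range_streaming() {
    use std::io::Cursor;
    let mut input = Cursor::new(b"a1b22c333d".to_vec());
    let mut output = Vec::new();
    delete_range_streaming(b'0', b'9', &mut input, &mut output).unwrap();
    assert_eq!(output, b"abcd".to_vec());
}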

// ============================================================================
// Streaming functions (Read + Write)
// ============================================================================

pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    // Check for identity table — pure passthrough (no transformation needed)
    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return passthrough_stream(reader, writer);
    }

    // Try SIMD fast path for constant-offset range translations (in-place, single buffer)
    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_range_stream(lo, hi, offset, reader, writer);
    }

    // Try SIMD fast path for range-to-constant translations (e.g., '\000-\037' -> 'X').
    // Uses blendv (5 SIMD ops/32 bytes) instead of nibble decomposition (48 ops/32 bytes).
    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
        return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
    }

    // General case: IN-PLACE translation on a SINGLE 16MB buffer.
    // This reduces memory traffic vs the old separate src/dst approach:
    // - Old: read into src, translate src→dst (read src + write dst), write() re-reads dst
    // - New: read into buf, translate in-place, write() re-reads buf (one buffer touched
    //   per chunk instead of two, and it stays hot in cache for the write syscall)
    // The 8x-unrolled in-place translate avoids store-to-load forwarding stalls
    // because consecutive loads are sequential, not aliased with recent stores.
    // Using a 16MB buffer = 1 read for a 10MB input, minimizing syscall count.
    // SAFETY: read_full initializes buf[..n] before those bytes are translated.
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_inplace(&mut buf[..n], &table);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}
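
// Illustrative end-to-end use of translate() with in-memory I/O; this particular
// set pair is detected as a constant-offset range and takes the SIMD fast path.
#[cfg(test)]
#[test]
fn translate_stream_end_to_end() {
    use std::io::Cursor;
    let set1: Vec<u8> = (b'a'..=b'z').collect();
    let set2: Vec<u8> = (b'A'..=b'Z').collect();
    let mut input = Cursor::new(b"hello tr".to_vec());
    let mut output = Vec::new();
    translate(&set1, &set2, &mut input, &mut output).unwrap();
    assert_eq!(output, b"HELLO TR".to_vec());
}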

/// Streaming SIMD range translation — single buffer, in-place transform.
/// Uses 16MB uninit buffer for fewer syscalls (translate is compute-light).
fn translate_range_stream(
    lo: u8,
    hi: u8,
    offset: i8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

/// Streaming SIMD range-to-constant translation — single buffer, in-place transform.
/// Uses blendv instead of nibble decomposition for ~10x fewer SIMD ops per vector.
fn translate_range_to_constant_stream(
    lo: u8,
    hi: u8,
    replacement: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

/// Pure passthrough: copy stdin to stdout without transformation.
/// Uses a single 16MB uninit buffer with direct read/write, no processing overhead.
fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

/// Read as many bytes as possible into buf, retrying on partial reads.
/// Fast path: first read() often fills the entire buffer for regular files.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    // Fast path: first read() usually fills the entire buffer for regular files
    let n = reader.read(buf)?;
    if n == buf.len() || n == 0 {
        return Ok(n);
    }
    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
    let mut total = n;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
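
// Illustrative check of the partial-read retry logic with a reader that hands
// out at most 3 bytes per read() call, like a slow pipe. `Dribble` is just a
// local test helper, not part of the tr implementation.
#[cfg(test)]
#[test]
fn read_full_retries_partial_reads() {
    struct Dribble<'a>(&'a [u8]);
    impl Read for Dribble<'_> {
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            let n = self.0.len().min(buf.len()).min(3);
            buf[..n].copy_from_slice(&self.0[..n]);
            self.0 = &self.0[n..];
            Ok(n)
        }
    }
    let mut r = Dribble(b"0123456789");
    let mut buf = [0u8; 8];
    assert_eq!(read_full(&mut r, &mut buf).unwrap(), 8);
    assert_eq!(&buf, b"01234567");
}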

pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // Two-pass optimization for range translations:
    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
    // Even though it's two passes, the translate pass is so much faster with SIMD
    // that the total is still a net win.
    let range_info = detect_range_offset(&table);
    let range_const_info = if range_info.is_none() {
        detect_range_to_constant(&table)
    } else {
        None
    };

    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        // Pass 1: translate
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        } else if let Some((lo, hi, replacement)) = range_const_info {
            translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
        } else {
            translate_inplace(&mut buf[..n], &table);
        }
        // Pass 2: squeeze in-place
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
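
// Illustrative use of translate_squeeze: 'a'-'c' all map to 'x' (a range-to-constant
// table), and the resulting run of 'x' bytes is squeezed down to a single byte.
#[cfg(test)]
#[test]
fn translate_squeeze_collapses_runs_of_set2_bytes() {
    use std::io::Cursor;
    let mut input = Cursor::new(b"aaabbbccc".to_vec());
    let mut output = Vec::new();
    translate_squeeze(b"abc", b"x", &mut input, &mut output).unwrap();
    assert_eq!(output, b"x".to_vec());
}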
1455
1456pub fn delete(
1457    delete_chars: &[u8],
1458    reader: &mut impl Read,
1459    writer: &mut impl Write,
1460) -> io::Result<()> {
1461    if delete_chars.len() == 1 {
1462        return delete_single_streaming(delete_chars[0], reader, writer);
1463    }
1464    if delete_chars.len() <= 3 {
1465        return delete_multi_streaming(delete_chars, reader, writer);
1466    }
1467
1468    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
1469    // use vectorized range comparison instead of scalar bitset lookup.
1470    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
1471    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1472        return delete_range_streaming(lo, hi, reader, writer);
1473    }
1474
1475    let member = build_member_set(delete_chars);
1476    let mut buf = alloc_uninit_vec(STREAM_BUF);
1477
1478    loop {
1479        let n = read_full(reader, &mut buf)?;
1480        if n == 0 {
1481            break;
1482        }
1483        let mut wp = 0;
1484        unsafe {
1485            let ptr = buf.as_mut_ptr();
1486            let mut i = 0;
1487            while i + 8 <= n {
1488                let b0 = *ptr.add(i);
1489                let b1 = *ptr.add(i + 1);
1490                let b2 = *ptr.add(i + 2);
1491                let b3 = *ptr.add(i + 3);
1492                let b4 = *ptr.add(i + 4);
1493                let b5 = *ptr.add(i + 5);
1494                let b6 = *ptr.add(i + 6);
1495                let b7 = *ptr.add(i + 7);
1496
1497                // Branchless: write the byte, then advance the write index only
1498                // when the byte is kept. Avoids branch mispredictions when most bytes are kept.
1499                *ptr.add(wp) = b0;
1500                wp += !is_member(&member, b0) as usize;
1501                *ptr.add(wp) = b1;
1502                wp += !is_member(&member, b1) as usize;
1503                *ptr.add(wp) = b2;
1504                wp += !is_member(&member, b2) as usize;
1505                *ptr.add(wp) = b3;
1506                wp += !is_member(&member, b3) as usize;
1507                *ptr.add(wp) = b4;
1508                wp += !is_member(&member, b4) as usize;
1509                *ptr.add(wp) = b5;
1510                wp += !is_member(&member, b5) as usize;
1511                *ptr.add(wp) = b6;
1512                wp += !is_member(&member, b6) as usize;
1513                *ptr.add(wp) = b7;
1514                wp += !is_member(&member, b7) as usize;
1515                i += 8;
1516            }
1517            while i < n {
1518                let b = *ptr.add(i);
1519                *ptr.add(wp) = b;
1520                wp += !is_member(&member, b) as usize;
1521                i += 1;
1522            }
1523        }
1524        writer.write_all(&buf[..wp])?;
1525    }
1526    Ok(())
1527}
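
// Illustrative sketch (hypothetical test): four scattered delete characters do
// not form a contiguous range, so this input goes through the branchless bitset
// loop above; a byte slice stands in for the reader and a Vec for the writer.
#[cfg(test)]
mod delete_bitset_sketch {
    use super::*;

    #[test]
    fn deletes_scattered_characters() {
        let mut input: &[u8] = b"hello, world!";
        let mut output = Vec::new();
        delete(b"lo!,", &mut input, &mut output).unwrap();
        assert_eq!(output, b"he wrd");
    }
}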
1528
1529fn delete_single_streaming(
1530    ch: u8,
1531    reader: &mut impl Read,
1532    writer: &mut impl Write,
1533) -> io::Result<()> {
1534    let mut src = alloc_uninit_vec(STREAM_BUF);
1535    let mut dst = alloc_uninit_vec(STREAM_BUF);
1536    loop {
1537        let n = read_full(reader, &mut src)?;
1538        if n == 0 {
1539            break;
1540        }
1541        // Use memchr to find byte positions, gap-copy to separate dst buffer.
1542        // Separate src/dst allows copy_nonoverlapping (faster than memmove)
1543        // and avoids aliasing concerns in the hot loop.
1544        let mut wp = 0;
1545        let mut i = 0;
1546        while i < n {
1547            match memchr::memchr(ch, &src[i..n]) {
1548                Some(offset) => {
1549                    if offset > 0 {
1550                        unsafe {
1551                            std::ptr::copy_nonoverlapping(
1552                                src.as_ptr().add(i),
1553                                dst.as_mut_ptr().add(wp),
1554                                offset,
1555                            );
1556                        }
1557                        wp += offset;
1558                    }
1559                    i += offset + 1;
1560                }
1561                None => {
1562                    let run_len = n - i;
1563                    if run_len > 0 {
1564                        unsafe {
1565                            std::ptr::copy_nonoverlapping(
1566                                src.as_ptr().add(i),
1567                                dst.as_mut_ptr().add(wp),
1568                                run_len,
1569                            );
1570                        }
1571                        wp += run_len;
1572                    }
1573                    break;
1574                }
1575            }
1576        }
1577        // Nothing was deleted: src and dst hold identical bytes, so write from src.
1578        if wp == n {
1579            writer.write_all(&src[..n])?;
1580        } else if wp > 0 {
1581            writer.write_all(&dst[..wp])?;
1582        }
1583    }
1584    Ok(())
1585}
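
// Illustrative sketch (hypothetical test): a single delete character routes
// through delete_single_streaming, which gap-copies the spans between memchr
// hits so only kept bytes reach the writer.
#[cfg(test)]
mod delete_single_sketch {
    use super::*;

    #[test]
    fn strips_newlines() {
        let mut input: &[u8] = b"a\nb\n\nc";
        let mut output = Vec::new();
        delete(b"\n", &mut input, &mut output).unwrap();
        assert_eq!(output, b"abc");
    }
}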
1586
1587fn delete_multi_streaming(
1588    chars: &[u8],
1589    reader: &mut impl Read,
1590    writer: &mut impl Write,
1591) -> io::Result<()> {
1592    let mut src = alloc_uninit_vec(STREAM_BUF);
1593    let mut dst = alloc_uninit_vec(STREAM_BUF);
1594    loop {
1595        let n = read_full(reader, &mut src)?;
1596        if n == 0 {
1597            break;
1598        }
1599        // Use memchr2/memchr3 to find byte positions, gap-copy to separate dst buffer.
1600        // Separate src/dst allows copy_nonoverlapping (faster than memmove).
1601        let mut wp = 0;
1602        let mut i = 0;
1603        while i < n {
1604            let found = if chars.len() == 2 {
1605                memchr::memchr2(chars[0], chars[1], &src[i..n])
1606            } else {
1607                memchr::memchr3(chars[0], chars[1], chars[2], &src[i..n])
1608            };
1609            match found {
1610                Some(offset) => {
1611                    if offset > 0 {
1612                        unsafe {
1613                            std::ptr::copy_nonoverlapping(
1614                                src.as_ptr().add(i),
1615                                dst.as_mut_ptr().add(wp),
1616                                offset,
1617                            );
1618                        }
1619                        wp += offset;
1620                    }
1621                    i += offset + 1;
1622                }
1623                None => {
1624                    let run_len = n - i;
1625                    if run_len > 0 {
1626                        unsafe {
1627                            std::ptr::copy_nonoverlapping(
1628                                src.as_ptr().add(i),
1629                                dst.as_mut_ptr().add(wp),
1630                                run_len,
1631                            );
1632                        }
1633                        wp += run_len;
1634                    }
1635                    break;
1636                }
1637            }
1638        }
1639        if wp == n {
1640            writer.write_all(&src[..n])?;
1641        } else if wp > 0 {
1642            writer.write_all(&dst[..wp])?;
1643        }
1644    }
1645    Ok(())
1646}
1647
1648pub fn delete_squeeze(
1649    delete_chars: &[u8],
1650    squeeze_chars: &[u8],
1651    reader: &mut impl Read,
1652    writer: &mut impl Write,
1653) -> io::Result<()> {
1654    let delete_set = build_member_set(delete_chars);
1655    let squeeze_set = build_member_set(squeeze_chars);
1656    let mut buf = alloc_uninit_vec(STREAM_BUF);
1657    let mut last_squeezed: u16 = 256;
1658
1659    loop {
1660        let n = read_full(reader, &mut buf)?;
1661        if n == 0 {
1662            break;
1663        }
1664        let mut wp = 0;
1665        unsafe {
1666            let ptr = buf.as_mut_ptr();
1667            for i in 0..n {
1668                let b = *ptr.add(i);
1669                if is_member(&delete_set, b) {
1670                    continue;
1671                }
1672                if is_member(&squeeze_set, b) {
1673                    if last_squeezed == b as u16 {
1674                        continue;
1675                    }
1676                    last_squeezed = b as u16;
1677                } else {
1678                    last_squeezed = 256;
1679                }
1680                *ptr.add(wp) = b;
1681                wp += 1;
1682            }
1683        }
1684        writer.write_all(&buf[..wp])?;
1685    }
1686    Ok(())
1687}
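
// Illustrative sketch (hypothetical test): deletion is applied before the
// squeeze check, so a run that is interrupted only by deleted bytes still
// collapses to a single occurrence.
#[cfg(test)]
mod delete_squeeze_sketch {
    use super::*;

    #[test]
    fn deletes_then_squeezes() {
        let mut input: &[u8] = b"a--a  a";
        let mut output = Vec::new();
        delete_squeeze(b"-", b"a", &mut input, &mut output).unwrap();
        assert_eq!(output, b"a  a");
    }
}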
1688
1689pub fn squeeze(
1690    squeeze_chars: &[u8],
1691    reader: &mut impl Read,
1692    writer: &mut impl Write,
1693) -> io::Result<()> {
1694    if squeeze_chars.len() == 1 {
1695        return squeeze_single_stream(squeeze_chars[0], reader, writer);
1696    }
1697
1698    let member = build_member_set(squeeze_chars);
1699    let mut buf = alloc_uninit_vec(STREAM_BUF);
1700    let mut last_squeezed: u16 = 256;
1701
1702    loop {
1703        let n = read_full(reader, &mut buf)?;
1704        if n == 0 {
1705            break;
1706        }
1707        let mut wp = 0;
1708        unsafe {
1709            let ptr = buf.as_mut_ptr();
1710            for i in 0..n {
1711                let b = *ptr.add(i);
1712                if is_member(&member, b) {
1713                    if last_squeezed == b as u16 {
1714                        continue;
1715                    }
1716                    last_squeezed = b as u16;
1717                } else {
1718                    last_squeezed = 256;
1719                }
1720                *ptr.add(wp) = b;
1721                wp += 1;
1722            }
1723        }
1724        writer.write_all(&buf[..wp])?;
1725    }
1726    Ok(())
1727}
1728
1729fn squeeze_single_stream(
1730    ch: u8,
1731    reader: &mut impl Read,
1732    writer: &mut impl Write,
1733) -> io::Result<()> {
1734    // Use a two-byte pattern finder (ch,ch) to jump directly to squeeze points.
1735    // For squeeze-spaces: most of the data has no consecutive spaces, so memmem
1736    // skips over huge regions at SIMD speed. When a pair is found, we scan the
1737    // run length and collapse it to one occurrence.
1738    let pair = [ch, ch];
1739    let finder = memchr::memmem::Finder::new(&pair);
1740    let mut buf = alloc_uninit_vec(STREAM_BUF);
1741    let mut was_squeeze_char = false;
1742
1743    loop {
1744        let n = read_full(reader, &mut buf)?;
1745        if n == 0 {
1746            break;
1747        }
1748
1749        let ptr = buf.as_mut_ptr();
1750        let mut wp = 0;
1751        let mut i = 0;
1752
1753        // Handle carry-over: if previous chunk ended with squeeze char,
1754        // skip leading occurrences of that char in this chunk.
1755        if was_squeeze_char {
1756            while i < n && unsafe { *ptr.add(i) } == ch {
1757                i += 1;
1758            }
1759            if i >= n {
1760                // Entire chunk is squeeze-char continuation
1761                // was_squeeze_char remains true
1762                continue;
1763            }
1764        }
1765
1766        // Use memmem to find consecutive pairs (ch, ch) — SIMD-accelerated.
1767        // Between pairs, the data passes through unchanged.
1768        loop {
1769            match finder.find(&buf[i..n]) {
1770                Some(offset) => {
1771                    // Copy everything up to and including the first char of the pair
1772                    let copy_len = offset + 1; // include first ch
1773                    if copy_len > 0 && wp != i {
1774                        unsafe {
1775                            std::ptr::copy(ptr.add(i), ptr.add(wp), copy_len);
1776                        }
1777                    }
1778                    wp += copy_len;
1779                    i += offset + 1; // position after first ch of pair
1780                    // Skip all remaining consecutive ch bytes (the run)
1781                    while i < n && unsafe { *ptr.add(i) } == ch {
1782                        i += 1;
1783                    }
1784                    if i >= n {
1785                        was_squeeze_char = true;
1786                        break;
1787                    }
1788                }
1789                None => {
1790                    // No more consecutive pairs — copy remainder
1791                    let run_len = n - i;
1792                    if run_len > 0 && wp != i {
1793                        unsafe {
1794                            std::ptr::copy(ptr.add(i), ptr.add(wp), run_len);
1795                        }
1796                    }
1797                    wp += run_len;
1798                    // Check if chunk ends with the squeeze char
1799                    was_squeeze_char = n > 0 && unsafe { *ptr.add(n - 1) } == ch;
1800                    break;
1801                }
1802            }
1803        }
1804
1805        writer.write_all(&buf[..wp])?;
1806    }
1807    Ok(())
1808}
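
// Illustrative sketch (hypothetical test): a one-character squeeze set routes
// through squeeze_single_stream, so runs of that character collapse while
// isolated occurrences pass through untouched.
#[cfg(test)]
mod squeeze_single_sketch {
    use super::*;

    #[test]
    fn collapses_space_runs() {
        let mut input: &[u8] = b"a  b    c d";
        let mut output = Vec::new();
        squeeze(b" ", &mut input, &mut output).unwrap();
        assert_eq!(output, b"a b c d");
    }
}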
1809
1810// ============================================================================
1811// Batch in-place functions (owned data from piped stdin)
1812// ============================================================================
1813
1814/// Translate bytes in-place on an owned buffer, then write.
1815/// For piped stdin where we own the data, this avoids the separate output buffer
1816/// allocation needed by translate_mmap. Uses parallel in-place SIMD for large data.
1817pub fn translate_owned(
1818    set1: &[u8],
1819    set2: &[u8],
1820    data: &mut [u8],
1821    writer: &mut impl Write,
1822) -> io::Result<()> {
1823    let table = build_translate_table(set1, set2);
1824
1825    // Identity table — pure passthrough
1826    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1827    if is_identity {
1828        return writer.write_all(data);
1829    }
1830
1831    // SIMD range fast path (in-place)
1832    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1833        if data.len() >= PARALLEL_THRESHOLD {
1834            let n_threads = rayon::current_num_threads().max(1);
1835            let chunk_size = (data.len() / n_threads).max(32 * 1024);
1836            data.par_chunks_mut(chunk_size).for_each(|chunk| {
1837                translate_range_simd_inplace(chunk, lo, hi, offset);
1838            });
1839        } else {
1840            translate_range_simd_inplace(data, lo, hi, offset);
1841        }
1842        return writer.write_all(data);
1843    }
1844
1845    // SIMD range-to-constant fast path (in-place)
1846    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
1847        if data.len() >= PARALLEL_THRESHOLD {
1848            let n_threads = rayon::current_num_threads().max(1);
1849            let chunk_size = (data.len() / n_threads).max(32 * 1024);
1850            data.par_chunks_mut(chunk_size).for_each(|chunk| {
1851                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
1852            });
1853        } else {
1854            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
1855        }
1856        return writer.write_all(data);
1857    }
1858
1859    // General table lookup (in-place)
1860    if data.len() >= PARALLEL_THRESHOLD {
1861        let n_threads = rayon::current_num_threads().max(1);
1862        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1863        data.par_chunks_mut(chunk_size).for_each(|chunk| {
1864            translate_inplace(chunk, &table);
1865        });
1866    } else {
1867        translate_inplace(data, &table);
1868    }
1869    writer.write_all(data)
1870}
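
// Illustrative sketch (hypothetical test): translate_owned mutates the caller's
// buffer in place and then writes it, so both the writer and the original
// buffer end up holding the translated bytes.
#[cfg(test)]
mod translate_owned_sketch {
    use super::*;

    #[test]
    fn uppercases_in_place() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut data = b"rust 1.0".to_vec();
        let mut output = Vec::new();
        translate_owned(&set1, &set2, &mut data, &mut output).unwrap();
        assert_eq!(output, b"RUST 1.0");
        assert_eq!(data, b"RUST 1.0");
    }
}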
1871
1872// ============================================================================
1873// Mmap-based functions (zero-copy input from byte slice)
1874// ============================================================================
1875
1876/// Maximum data size for single-allocation translate approach.
1877/// Below this limit, translate ALL data into one buffer and do a single write_all.
1878/// Above this, use chunked approach to limit memory usage.
1879const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;
1880
1881/// Translate bytes from an mmap'd byte slice.
1882/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
1883/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
1884/// Falls back to scalar 256-byte table lookup for general translations.
1885///
1886/// For data >= 16MB (PARALLEL_THRESHOLD): uses rayon parallel processing across multiple cores.
1887/// For smaller data (up to SINGLE_WRITE_LIMIT): single allocation + single write_all (1 syscall).
1888/// A chunked fallback bounds memory at BUF_SIZE if the two limits ever diverge.
1889pub fn translate_mmap(
1890    set1: &[u8],
1891    set2: &[u8],
1892    data: &[u8],
1893    writer: &mut impl Write,
1894) -> io::Result<()> {
1895    let table = build_translate_table(set1, set2);
1896
1897    // Check if table is identity — pure passthrough
1898    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1899    if is_identity {
1900        return writer.write_all(data);
1901    }
1902
1903    // Try SIMD fast path for single-range constant-offset translations
1904    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1905        return translate_mmap_range(data, writer, lo, hi, offset);
1906    }
1907
1908    // Try SIMD fast path for range-to-constant translations
1909    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
1910        return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
1911    }
1912
1913    // General case: table lookup (with parallel processing for large data)
1914    translate_mmap_table(data, writer, &table)
1915}
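
// Illustrative sketch (hypothetical test): 'a'->'1' and 'c'->'2' use different
// offsets, so this mapping should fall through to the general table path; either
// way the read-only input is left untouched and the result goes to the writer.
#[cfg(test)]
mod translate_mmap_sketch {
    use super::*;

    #[test]
    fn general_table_translation() {
        let mut output = Vec::new();
        translate_mmap(b"ac", b"12", b"abcabc", &mut output).unwrap();
        assert_eq!(output, b"1b21b2");
    }
}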
1916
1917/// SIMD range translate for mmap data, with rayon parallel processing.
1918fn translate_mmap_range(
1919    data: &[u8],
1920    writer: &mut impl Write,
1921    lo: u8,
1922    hi: u8,
1923    offset: i8,
1924) -> io::Result<()> {
1925    // Parallel path: split data into chunks, translate each in parallel
1926    if data.len() >= PARALLEL_THRESHOLD {
1927        let mut buf = alloc_uninit_vec(data.len());
1928        let n_threads = rayon::current_num_threads().max(1);
1929        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1930
1931        // Process chunks in parallel: each thread writes to its slice of buf
1932        data.par_chunks(chunk_size)
1933            .zip(buf.par_chunks_mut(chunk_size))
1934            .for_each(|(src_chunk, dst_chunk)| {
1935                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
1936            });
1937
1938        return writer.write_all(&buf);
1939    }
1940
1941    // Small data: single-threaded SIMD
1942    if data.len() <= SINGLE_WRITE_LIMIT {
1943        let mut buf = alloc_uninit_vec(data.len());
1944        translate_range_simd(data, &mut buf, lo, hi, offset);
1945        return writer.write_all(&buf);
1946    }
1947    // Chunked fallback (unreachable in practice: data above SINGLE_WRITE_LIMIT already took the parallel path, since PARALLEL_THRESHOLD <= SINGLE_WRITE_LIMIT)
1948    let mut buf = alloc_uninit_vec(BUF_SIZE);
1949    for chunk in data.chunks(BUF_SIZE) {
1950        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
1951        writer.write_all(&buf[..chunk.len()])?;
1952    }
1953    Ok(())
1954}
1955
1956/// SIMD range-to-constant translate for mmap data.
1957/// Uses blendv (5 SIMD ops/32 bytes) for range-to-constant patterns.
1958fn translate_mmap_range_to_constant(
1959    data: &[u8],
1960    writer: &mut impl Write,
1961    lo: u8,
1962    hi: u8,
1963    replacement: u8,
1964) -> io::Result<()> {
1965    // For mmap data (read-only), copy to buffer and translate in-place
1966    if data.len() >= PARALLEL_THRESHOLD {
1967        let mut buf = alloc_uninit_vec(data.len());
1968        let n_threads = rayon::current_num_threads().max(1);
1969        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1970
1971        // Copy + translate in parallel
1972        data.par_chunks(chunk_size)
1973            .zip(buf.par_chunks_mut(chunk_size))
1974            .for_each(|(src_chunk, dst_chunk)| {
1975                dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
1976                translate_range_to_constant_simd_inplace(
1977                    &mut dst_chunk[..src_chunk.len()],
1978                    lo,
1979                    hi,
1980                    replacement,
1981                );
1982            });
1983
1984        return writer.write_all(&buf);
1985    }
1986
1987    if data.len() <= SINGLE_WRITE_LIMIT {
1988        let mut buf = data.to_vec();
1989        translate_range_to_constant_simd_inplace(&mut buf, lo, hi, replacement);
1990        return writer.write_all(&buf);
1991    }
1992    let mut buf = alloc_uninit_vec(BUF_SIZE);
1993    for chunk in data.chunks(BUF_SIZE) {
1994        buf[..chunk.len()].copy_from_slice(chunk);
1995        translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
1996        writer.write_all(&buf[..chunk.len()])?;
1997    }
1998    Ok(())
1999}
2000
2001/// General table-lookup translate for mmap data, with rayon parallel processing.
2002fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
2003    // Parallel path: split data into chunks, translate each in parallel
2004    if data.len() >= PARALLEL_THRESHOLD {
2005        let mut buf = alloc_uninit_vec(data.len());
2006        let n_threads = rayon::current_num_threads().max(1);
2007        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2008
2009        data.par_chunks(chunk_size)
2010            .zip(buf.par_chunks_mut(chunk_size))
2011            .for_each(|(src_chunk, dst_chunk)| {
2012                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
2013            });
2014
2015        return writer.write_all(&buf);
2016    }
2017
2018    // Small data: single-threaded
2019    if data.len() <= SINGLE_WRITE_LIMIT {
2020        let mut buf = alloc_uninit_vec(data.len());
2021        translate_to(data, &mut buf, table);
2022        return writer.write_all(&buf);
2023    }
2024    let mut buf = alloc_uninit_vec(BUF_SIZE);
2025    for chunk in data.chunks(BUF_SIZE) {
2026        translate_to(chunk, &mut buf[..chunk.len()], table);
2027        writer.write_all(&buf[..chunk.len()])?;
2028    }
2029    Ok(())
2030}
2031
2032/// Translate + squeeze from mmap'd byte slice.
2033///
2034/// For data >= 16MB (PARALLEL_THRESHOLD): two-phase: parallel translate, then sequential squeeze.
2035/// For smaller data (up to SINGLE_WRITE_LIMIT): single-pass translate+squeeze into one buffer, one write syscall.
2036/// A chunked fallback bounds memory at BUF_SIZE if the two limits ever diverge.
2037pub fn translate_squeeze_mmap(
2038    set1: &[u8],
2039    set2: &[u8],
2040    data: &[u8],
2041    writer: &mut impl Write,
2042) -> io::Result<()> {
2043    let table = build_translate_table(set1, set2);
2044    let squeeze_set = build_member_set(set2);
2045
2046    // For large data: two-phase approach
2047    // Phase 1: parallel translate into buffer
2048    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
2049    //          (squeeze only removes bytes, never grows, so no second allocation needed)
2050    if data.len() >= PARALLEL_THRESHOLD {
2051        // Phase 1: parallel translate
2052        let mut translated = alloc_uninit_vec(data.len());
2053        let range_info = detect_range_offset(&table);
2054        let n_threads = rayon::current_num_threads().max(1);
2055        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2056
2057        if let Some((lo, hi, offset)) = range_info {
2058            data.par_chunks(chunk_size)
2059                .zip(translated.par_chunks_mut(chunk_size))
2060                .for_each(|(src_chunk, dst_chunk)| {
2061                    translate_range_simd(
2062                        src_chunk,
2063                        &mut dst_chunk[..src_chunk.len()],
2064                        lo,
2065                        hi,
2066                        offset,
2067                    );
2068                });
2069        } else {
2070            data.par_chunks(chunk_size)
2071                .zip(translated.par_chunks_mut(chunk_size))
2072                .for_each(|(src_chunk, dst_chunk)| {
2073                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
2074                });
2075        }
2076
2077        // Phase 2: squeeze in-place on the translated buffer.
2078        // Since squeeze only removes bytes (never grows), we can read ahead and
2079        // compact into the same buffer, saving a full data.len() heap allocation.
2080        let mut last_squeezed: u16 = 256;
2081        let len = translated.len();
2082        let mut wp = 0;
2083        unsafe {
2084            let ptr = translated.as_mut_ptr();
2085            let mut i = 0;
2086            while i < len {
2087                let b = *ptr.add(i);
2088                if is_member(&squeeze_set, b) {
2089                    if last_squeezed == b as u16 {
2090                        i += 1;
2091                        continue;
2092                    }
2093                    last_squeezed = b as u16;
2094                } else {
2095                    last_squeezed = 256;
2096                }
2097                *ptr.add(wp) = b;
2098                wp += 1;
2099                i += 1;
2100            }
2101        }
2102        return writer.write_all(&translated[..wp]);
2103    }
2104
2105    // Single-write fast path: translate+squeeze all data in one pass, one write
2106    if data.len() <= SINGLE_WRITE_LIMIT {
2107        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
2108        let mut last_squeezed: u16 = 256;
2109        unsafe {
2110            buf.set_len(data.len());
2111            let outp: *mut u8 = buf.as_mut_ptr();
2112            let inp = data.as_ptr();
2113            let len = data.len();
2114            let mut wp = 0;
2115            let mut i = 0;
2116            while i < len {
2117                let translated = *table.get_unchecked(*inp.add(i) as usize);
2118                if is_member(&squeeze_set, translated) {
2119                    if last_squeezed == translated as u16 {
2120                        i += 1;
2121                        continue;
2122                    }
2123                    last_squeezed = translated as u16;
2124                } else {
2125                    last_squeezed = 256;
2126                }
2127                *outp.add(wp) = translated;
2128                wp += 1;
2129                i += 1;
2130            }
2131            buf.set_len(wp);
2132        }
2133        return writer.write_all(&buf);
2134    }
2135
2136    // Chunked path for large data
2137    let buf_size = data.len().min(BUF_SIZE);
2138    let mut buf = vec![0u8; buf_size];
2139    let mut last_squeezed: u16 = 256;
2140
2141    for chunk in data.chunks(buf_size) {
2142        translate_to(chunk, &mut buf[..chunk.len()], &table);
2143        let mut wp = 0;
2144        unsafe {
2145            let ptr = buf.as_mut_ptr();
2146            for i in 0..chunk.len() {
2147                let b = *ptr.add(i);
2148                if is_member(&squeeze_set, b) {
2149                    if last_squeezed == b as u16 {
2150                        continue;
2151                    }
2152                    last_squeezed = b as u16;
2153                } else {
2154                    last_squeezed = 256;
2155                }
2156                *ptr.add(wp) = b;
2157                wp += 1;
2158            }
2159        }
2160        writer.write_all(&buf[..wp])?;
2161    }
2162    Ok(())
2163}
2164
2165/// Delete from mmap'd byte slice.
2166///
2167/// For data >= 16MB (PARALLEL_THRESHOLD): uses rayon parallel processing across multiple cores.
2168/// For smaller data (up to SINGLE_WRITE_LIMIT): delete into one buffer, one write syscall.
2169/// A chunked fallback bounds memory at BUF_SIZE if the two limits ever diverge.
2170pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2171    if delete_chars.len() == 1 {
2172        return delete_single_char_mmap(delete_chars[0], data, writer);
2173    }
2174    if delete_chars.len() <= 3 {
2175        return delete_multi_memchr_mmap(delete_chars, data, writer);
2176    }
2177
2178    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
2179    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2180        return delete_range_mmap(data, writer, lo, hi);
2181    }
2182
2183    let member = build_member_set(delete_chars);
2184
2185    // Parallel path: pre-allocate a single output buffer of data.len() and have each
2186    // thread write to its non-overlapping slice, then do a single write_all.
2187    // This avoids per-chunk Vec allocations that the old approach had.
2188    if data.len() >= PARALLEL_THRESHOLD {
2189        let n_threads = rayon::current_num_threads().max(1);
2190        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2191
2192        // Each thread deletes into its slice of outbuf and returns bytes written.
2193        let mut outbuf = alloc_uninit_vec(data.len());
2194        let chunk_lens: Vec<usize> = data
2195            .par_chunks(chunk_size)
2196            .zip(outbuf.par_chunks_mut(chunk_size))
2197            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
2198            .collect();
2199
2200        // Compact: move each chunk's output to be contiguous.
2201        // chunk_lens[i] is how many bytes thread i wrote into its slice.
2202        // We need to shift them together since each dst_chunk started at chunk_size offsets.
2203        let mut write_pos = 0;
2204        let mut src_offset = 0;
2205        for &clen in &chunk_lens {
2206            if clen > 0 && src_offset != write_pos {
2207                unsafe {
2208                    std::ptr::copy(
2209                        outbuf.as_ptr().add(src_offset),
2210                        outbuf.as_mut_ptr().add(write_pos),
2211                        clen,
2212                    );
2213                }
2214            }
2215            write_pos += clen;
2216            src_offset += chunk_size;
2217        }
2218
2219        return writer.write_all(&outbuf[..write_pos]);
2220    }
2221
2222    // Single-write fast path: delete into one buffer, one write
2223    if data.len() <= SINGLE_WRITE_LIMIT {
2224        let mut outbuf = alloc_uninit_vec(data.len());
2225        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
2226        return writer.write_all(&outbuf[..out_pos]);
2227    }
2228
2229    // Chunked path for large data
2230    let buf_size = data.len().min(BUF_SIZE);
2231    let mut outbuf = alloc_uninit_vec(buf_size);
2232
2233    for chunk in data.chunks(buf_size) {
2234        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
2235        writer.write_all(&outbuf[..out_pos])?;
2236    }
2237    Ok(())
2238}
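
// Illustrative sketch (hypothetical test): more than three scattered delete
// characters take the bitset path; for input under SINGLE_WRITE_LIMIT the kept
// bytes are compacted into one buffer and written with a single write_all.
#[cfg(test)]
mod delete_mmap_sketch {
    use super::*;

    #[test]
    fn deletes_scattered_set() {
        let mut output = Vec::new();
        delete_mmap(b"xz?!", b"x?a!bzc", &mut output).unwrap();
        assert_eq!(output, b"abc");
    }
}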
2239
2240/// SIMD range delete for mmap data, with rayon parallel processing.
2241fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
2242    // Parallel path: each thread deletes from its chunk into a local Vec
2243    if data.len() >= PARALLEL_THRESHOLD {
2244        let n_threads = rayon::current_num_threads().max(1);
2245        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2246
2247        let results: Vec<Vec<u8>> = data
2248            .par_chunks(chunk_size)
2249            .map(|chunk| {
2250                let mut out = alloc_uninit_vec(chunk.len());
2251                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
2252                unsafe { out.set_len(wp) };
2253                out
2254            })
2255            .collect();
2256
2257        let slices: Vec<std::io::IoSlice> = results
2258            .iter()
2259            .filter(|r| !r.is_empty())
2260            .map(|r| std::io::IoSlice::new(r))
2261            .collect();
2262        return write_ioslices(writer, &slices);
2263    }
2264
2265    // Single-write fast path
2266    if data.len() <= SINGLE_WRITE_LIMIT {
2267        let mut outbuf = alloc_uninit_vec(data.len());
2268        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
2269        return writer.write_all(&outbuf[..wp]);
2270    }
2271
2272    // Chunked path
2273    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
2274    for chunk in data.chunks(BUF_SIZE) {
2275        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
2276        writer.write_all(&outbuf[..wp])?;
2277    }
2278    Ok(())
2279}
2280
2281/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
2282/// Returns number of bytes written.
2283#[inline]
2284fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
2285    let len = chunk.len();
2286    let mut out_pos = 0;
2287    let mut i = 0;
2288
2289    while i + 8 <= len {
2290        unsafe {
2291            let b0 = *chunk.get_unchecked(i);
2292            let b1 = *chunk.get_unchecked(i + 1);
2293            let b2 = *chunk.get_unchecked(i + 2);
2294            let b3 = *chunk.get_unchecked(i + 3);
2295            let b4 = *chunk.get_unchecked(i + 4);
2296            let b5 = *chunk.get_unchecked(i + 5);
2297            let b6 = *chunk.get_unchecked(i + 6);
2298            let b7 = *chunk.get_unchecked(i + 7);
2299
2300            *outbuf.get_unchecked_mut(out_pos) = b0;
2301            out_pos += !is_member(member, b0) as usize;
2302            *outbuf.get_unchecked_mut(out_pos) = b1;
2303            out_pos += !is_member(member, b1) as usize;
2304            *outbuf.get_unchecked_mut(out_pos) = b2;
2305            out_pos += !is_member(member, b2) as usize;
2306            *outbuf.get_unchecked_mut(out_pos) = b3;
2307            out_pos += !is_member(member, b3) as usize;
2308            *outbuf.get_unchecked_mut(out_pos) = b4;
2309            out_pos += !is_member(member, b4) as usize;
2310            *outbuf.get_unchecked_mut(out_pos) = b5;
2311            out_pos += !is_member(member, b5) as usize;
2312            *outbuf.get_unchecked_mut(out_pos) = b6;
2313            out_pos += !is_member(member, b6) as usize;
2314            *outbuf.get_unchecked_mut(out_pos) = b7;
2315            out_pos += !is_member(member, b7) as usize;
2316        }
2317        i += 8;
2318    }
2319
2320    while i < len {
2321        unsafe {
2322            let b = *chunk.get_unchecked(i);
2323            *outbuf.get_unchecked_mut(out_pos) = b;
2324            out_pos += !is_member(member, b) as usize;
2325        }
2326        i += 1;
2327    }
2328
2329    out_pos
2330}
2331
2332fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2333    // Zero-copy delete using writev: build IoSlice entries pointing to the
2334    // gaps between deleted characters in the ORIGINAL data buffer.
2335    // For `tr -d '\n'` on 10MB with ~200K newlines:
2336    //   - Old: 10MB allocation + 10MB copy into output buffer
2337    //   - New: ~200K * 16 = 3.2MB IoSlice entries, zero data copy
2338    // Uses SIMD memchr_iter to find all positions, then builds IoSlice spans.
2339    let mut iov: Vec<std::io::IoSlice> = Vec::new();
2340    let mut last = 0;
2341    for pos in memchr::memchr_iter(ch, data) {
2342        if pos > last {
2343            iov.push(std::io::IoSlice::new(&data[last..pos]));
2344        }
2345        last = pos + 1;
2346    }
2347    if last < data.len() {
2348        iov.push(std::io::IoSlice::new(&data[last..]));
2349    }
2350    write_ioslices(writer, &iov)
2351}
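
// Illustrative sketch (hypothetical test): the IoSlice spans point directly into
// the input slice, so the kept gaps are written without copying the data. A Vec
// writer is enough here because write_ioslices falls back to write_all on
// partial writes.
#[cfg(test)]
mod delete_single_char_mmap_sketch {
    use super::*;

    #[test]
    fn drops_newlines_without_copying() {
        let mut output = Vec::new();
        delete_mmap(b"\n", b"line1\nline2\n", &mut output).unwrap();
        assert_eq!(output, b"line1line2");
    }
}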
2352
2353fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2354    let c0 = chars[0];
2355    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
2356    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
2357    let is_three = chars.len() >= 3;
2358
2359    // Zero-copy delete using writev: build IoSlice entries pointing to the
2360    // gaps between deleted characters in the original data buffer.
2361    let mut iov: Vec<std::io::IoSlice> = Vec::new();
2362    let mut last = 0;
2363    if is_three {
2364        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
2365            if pos > last {
2366                iov.push(std::io::IoSlice::new(&data[last..pos]));
2367            }
2368            last = pos + 1;
2369        }
2370    } else {
2371        for pos in memchr::memchr2_iter(c0, c1, data) {
2372            if pos > last {
2373                iov.push(std::io::IoSlice::new(&data[last..pos]));
2374            }
2375            last = pos + 1;
2376        }
2377    }
2378    if last < data.len() {
2379        iov.push(std::io::IoSlice::new(&data[last..]));
2380    }
2381    write_ioslices(writer, &iov)
2382}
2383
2384/// Delete + squeeze from mmap'd byte slice.
2385///
2386/// For data <= 16MB: delete+squeeze into one buffer, one write syscall.
2387/// For data > 16MB: chunked approach to limit memory.
2388pub fn delete_squeeze_mmap(
2389    delete_chars: &[u8],
2390    squeeze_chars: &[u8],
2391    data: &[u8],
2392    writer: &mut impl Write,
2393) -> io::Result<()> {
2394    let delete_set = build_member_set(delete_chars);
2395    let squeeze_set = build_member_set(squeeze_chars);
2396
2397    // Single-write fast path: delete+squeeze all data in one pass, one write
2398    if data.len() <= SINGLE_WRITE_LIMIT {
2399        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
2400        let mut last_squeezed: u16 = 256;
2401        unsafe {
2402            outbuf.set_len(data.len());
2403            let outp: *mut u8 = outbuf.as_mut_ptr();
2404            let inp = data.as_ptr();
2405            let len = data.len();
2406            let mut out_pos = 0;
2407            let mut i = 0;
2408            while i < len {
2409                let b = *inp.add(i);
2410                if is_member(&delete_set, b) {
2411                    i += 1;
2412                    continue;
2413                }
2414                if is_member(&squeeze_set, b) {
2415                    if last_squeezed == b as u16 {
2416                        i += 1;
2417                        continue;
2418                    }
2419                    last_squeezed = b as u16;
2420                } else {
2421                    last_squeezed = 256;
2422                }
2423                *outp.add(out_pos) = b;
2424                out_pos += 1;
2425                i += 1;
2426            }
2427            outbuf.set_len(out_pos);
2428        }
2429        return writer.write_all(&outbuf);
2430    }
2431
2432    // Chunked path for large data
2433    let buf_size = data.len().min(BUF_SIZE);
2434    let mut outbuf = vec![0u8; buf_size];
2435    let mut last_squeezed: u16 = 256;
2436
2437    for chunk in data.chunks(buf_size) {
2438        let mut out_pos = 0;
2439        for &b in chunk {
2440            if is_member(&delete_set, b) {
2441                continue;
2442            }
2443            if is_member(&squeeze_set, b) {
2444                if last_squeezed == b as u16 {
2445                    continue;
2446                }
2447                last_squeezed = b as u16;
2448            } else {
2449                last_squeezed = 256;
2450            }
2451            unsafe {
2452                *outbuf.get_unchecked_mut(out_pos) = b;
2453            }
2454            out_pos += 1;
2455        }
2456        writer.write_all(&outbuf[..out_pos])?;
2457    }
2458    Ok(())
2459}
2460
2461/// Squeeze from mmap'd byte slice.
2462///
2463/// For data >= 16MB (PARALLEL_THRESHOLD): uses rayon parallel processing with boundary fixup.
2464/// For smaller data (up to SINGLE_WRITE_LIMIT): squeeze into one buffer, one write syscall.
2465/// A chunked fallback bounds memory at BUF_SIZE if the two limits ever diverge.
2466pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2467    if squeeze_chars.len() == 1 {
2468        return squeeze_single_mmap(squeeze_chars[0], data, writer);
2469    }
2470    if squeeze_chars.len() == 2 {
2471        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
2472    }
2473    if squeeze_chars.len() == 3 {
2474        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
2475    }
2476
2477    let member = build_member_set(squeeze_chars);
2478
2479    // Parallel path: squeeze each chunk independently, then fix boundaries
2480    if data.len() >= PARALLEL_THRESHOLD {
2481        let n_threads = rayon::current_num_threads().max(1);
2482        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2483
2484        let results: Vec<Vec<u8>> = data
2485            .par_chunks(chunk_size)
2486            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2487            .collect();
2488
2489        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
2490        // and chunk N+1 starts with same byte B, and B is in squeeze set,
2491        // skip the first byte(s) of chunk N+1 that equal B.
2492        // Collect slices for writev to minimize syscalls.
2493        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2494        for (idx, result) in results.iter().enumerate() {
2495            if result.is_empty() {
2496                continue;
2497            }
2498            if idx > 0 {
2499                // Check boundary: does previous chunk end with same squeezable byte?
2500                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2501                    if is_member(&member, prev_last) {
2502                        // Skip leading bytes in this chunk that equal prev_last
2503                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2504                        if skip < result.len() {
2505                            slices.push(std::io::IoSlice::new(&result[skip..]));
2506                        }
2507                        continue;
2508                    }
2509                }
2510            }
2511            slices.push(std::io::IoSlice::new(result));
2512        }
2513        return write_ioslices(writer, &slices);
2514    }
2515
2516    // Single-write fast path: squeeze all data into one buffer, one write
2517    if data.len() <= SINGLE_WRITE_LIMIT {
2518        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
2519        let mut last_squeezed: u16 = 256;
2520        let len = data.len();
2521        let mut wp = 0;
2522        let mut i = 0;
2523
2524        unsafe {
2525            outbuf.set_len(data.len());
2526            let inp = data.as_ptr();
2527            let outp: *mut u8 = outbuf.as_mut_ptr();
2528
2529            while i < len {
2530                let b = *inp.add(i);
2531                if is_member(&member, b) {
2532                    if last_squeezed != b as u16 {
2533                        *outp.add(wp) = b;
2534                        wp += 1;
2535                        last_squeezed = b as u16;
2536                    }
2537                    i += 1;
2538                    while i < len && *inp.add(i) == b {
2539                        i += 1;
2540                    }
2541                } else {
2542                    last_squeezed = 256;
2543                    *outp.add(wp) = b;
2544                    wp += 1;
2545                    i += 1;
2546                }
2547            }
2548            outbuf.set_len(wp);
2549        }
2550        return writer.write_all(&outbuf);
2551    }
2552
2553    // Chunked path for large data
2554    let buf_size = data.len().min(BUF_SIZE);
2555    let mut outbuf = vec![0u8; buf_size];
2556    let mut last_squeezed: u16 = 256;
2557
2558    for chunk in data.chunks(buf_size) {
2559        let len = chunk.len();
2560        let mut wp = 0;
2561        let mut i = 0;
2562
2563        unsafe {
2564            let inp = chunk.as_ptr();
2565            let outp = outbuf.as_mut_ptr();
2566
2567            while i < len {
2568                let b = *inp.add(i);
2569                if is_member(&member, b) {
2570                    if last_squeezed != b as u16 {
2571                        *outp.add(wp) = b;
2572                        wp += 1;
2573                        last_squeezed = b as u16;
2574                    }
2575                    i += 1;
2576                    while i < len && *inp.add(i) == b {
2577                        i += 1;
2578                    }
2579                } else {
2580                    last_squeezed = 256;
2581                    *outp.add(wp) = b;
2582                    wp += 1;
2583                    i += 1;
2584                }
2585            }
2586        }
2587        writer.write_all(&outbuf[..wp])?;
2588    }
2589    Ok(())
2590}
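
// Illustrative sketch (hypothetical test): with more than three squeeze
// characters the bitset path is used; squeeze state spans the whole input, so
// each run of a squeeze-set byte collapses independently.
#[cfg(test)]
mod squeeze_mmap_sketch {
    use super::*;

    #[test]
    fn squeezes_multiple_characters() {
        let mut output = Vec::new();
        squeeze_mmap(b" \t-x", b"a  b\t\t--c--xx", &mut output).unwrap();
        assert_eq!(output, b"a b\t-c-x");
    }
}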
2591
2592/// Squeeze a single chunk using bitset membership. Returns squeezed output.
2593fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
2594    let len = chunk.len();
2595    let mut out = Vec::with_capacity(len);
2596    let mut last_squeezed: u16 = 256;
2597    let mut i = 0;
2598
2599    unsafe {
2600        out.set_len(len);
2601        let inp = chunk.as_ptr();
2602        let outp: *mut u8 = out.as_mut_ptr();
2603        let mut wp = 0;
2604
2605        while i < len {
2606            let b = *inp.add(i);
2607            if is_member(member, b) {
2608                if last_squeezed != b as u16 {
2609                    *outp.add(wp) = b;
2610                    wp += 1;
2611                    last_squeezed = b as u16;
2612                }
2613                i += 1;
2614                while i < len && *inp.add(i) == b {
2615                    i += 1;
2616                }
2617            } else {
2618                last_squeezed = 256;
2619                *outp.add(wp) = b;
2620                wp += 1;
2621                i += 1;
2622            }
2623        }
2624        out.set_len(wp);
2625    }
2626    out
2627}
2628
2629fn squeeze_multi_mmap<const N: usize>(
2630    chars: &[u8],
2631    data: &[u8],
2632    writer: &mut impl Write,
2633) -> io::Result<()> {
2634    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
2635    if data.len() >= PARALLEL_THRESHOLD {
2636        let member = build_member_set(chars);
2637        let n_threads = rayon::current_num_threads().max(1);
2638        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2639
2640        let results: Vec<Vec<u8>> = data
2641            .par_chunks(chunk_size)
2642            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2643            .collect();
2644
2645        // Build IoSlice list, fixing boundaries
2646        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2647        for (idx, result) in results.iter().enumerate() {
2648            if result.is_empty() {
2649                continue;
2650            }
2651            if idx > 0 {
2652                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2653                    if is_member(&member, prev_last) {
2654                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2655                        if skip < result.len() {
2656                            slices.push(std::io::IoSlice::new(&result[skip..]));
2657                        }
2658                        continue;
2659                    }
2660                }
2661            }
2662            slices.push(std::io::IoSlice::new(result));
2663        }
2664        return write_ioslices(writer, &slices);
2665    }
2666
2667    let buf_size = data.len().min(BUF_SIZE);
2668    let mut outbuf = vec![0u8; buf_size];
2669    let mut wp = 0;
2670    let mut last_squeezed: u16 = 256;
2671    let mut cursor = 0;
2672
2673    macro_rules! find_next {
2674        ($data:expr) => {
2675            if N == 2 {
2676                memchr::memchr2(chars[0], chars[1], $data)
2677            } else {
2678                memchr::memchr3(chars[0], chars[1], chars[2], $data)
2679            }
2680        };
2681    }
2682
2683    macro_rules! flush_and_copy {
2684        ($src:expr, $len:expr) => {
2685            if wp + $len > buf_size {
2686                writer.write_all(&outbuf[..wp])?;
2687                wp = 0;
2688            }
2689            if $len > buf_size {
2690                writer.write_all($src)?;
2691            } else {
2692                outbuf[wp..wp + $len].copy_from_slice($src);
2693                wp += $len;
2694            }
2695        };
2696    }
2697
2698    while cursor < data.len() {
2699        match find_next!(&data[cursor..]) {
2700            Some(offset) => {
2701                let pos = cursor + offset;
2702                let b = data[pos];
2703                if pos > cursor {
2704                    let span = pos - cursor;
2705                    flush_and_copy!(&data[cursor..pos], span);
2706                    last_squeezed = 256;
2707                }
2708                if last_squeezed != b as u16 {
2709                    if wp >= buf_size {
2710                        writer.write_all(&outbuf[..wp])?;
2711                        wp = 0;
2712                    }
2713                    outbuf[wp] = b;
2714                    wp += 1;
2715                    last_squeezed = b as u16;
2716                }
2717                let mut skip = pos + 1;
2718                while skip < data.len() && data[skip] == b {
2719                    skip += 1;
2720                }
2721                cursor = skip;
2722            }
2723            None => {
2724                let remaining = data.len() - cursor;
2725                flush_and_copy!(&data[cursor..], remaining);
2726                break;
2727            }
2728        }
2729    }
2730    if wp > 0 {
2731        writer.write_all(&outbuf[..wp])?;
2732    }
2733    Ok(())
2734}
2735
2736fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2737    if data.is_empty() {
2738        return Ok(());
2739    }
2740
2741    if memchr::memmem::find(data, &[ch, ch]).is_none() {
2742        return writer.write_all(data);
2743    }
2744
2745    // Parallel path: squeeze each chunk, fix boundaries
2746    if data.len() >= PARALLEL_THRESHOLD {
2747        let n_threads = rayon::current_num_threads().max(1);
2748        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2749
2750        let results: Vec<Vec<u8>> = data
2751            .par_chunks(chunk_size)
2752            .map(|chunk| {
2753                let mut out = Vec::with_capacity(chunk.len());
2754                let mut cursor = 0;
2755                while cursor < chunk.len() {
2756                    match memchr::memchr(ch, &chunk[cursor..]) {
2757                        Some(offset) => {
2758                            let pos = cursor + offset;
2759                            if pos > cursor {
2760                                out.extend_from_slice(&chunk[cursor..pos]);
2761                            }
2762                            out.push(ch);
2763                            cursor = pos + 1;
2764                            while cursor < chunk.len() && chunk[cursor] == ch {
2765                                cursor += 1;
2766                            }
2767                        }
2768                        None => {
2769                            out.extend_from_slice(&chunk[cursor..]);
2770                            break;
2771                        }
2772                    }
2773                }
2774                out
2775            })
2776            .collect();
2777
2778        // Build IoSlice list, fixing boundary squeezability.
2779        // Use writev to minimize syscalls.
2780        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2781        for (idx, result) in results.iter().enumerate() {
2782            if result.is_empty() {
2783                continue;
2784            }
2785            if idx > 0 {
2786                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2787                    if prev_last == ch {
2788                        // Skip leading ch bytes in this chunk result
2789                        let skip = result.iter().take_while(|&&b| b == ch).count();
2790                        if skip < result.len() {
2791                            slices.push(std::io::IoSlice::new(&result[skip..]));
2792                        }
2793                        continue;
2794                    }
2795                }
2796            }
2797            slices.push(std::io::IoSlice::new(result));
2798        }
2799        return write_ioslices(writer, &slices);
2800    }
2801
2802    let buf_size = data.len().min(BUF_SIZE);
2803    let mut outbuf = vec![0u8; buf_size];
2804    let len = data.len();
2805    let mut wp = 0;
2806    let mut cursor = 0;
2807
2808    while cursor < len {
2809        match memchr::memchr(ch, &data[cursor..]) {
2810            Some(offset) => {
2811                let pos = cursor + offset;
2812                let gap = pos - cursor;
2813                if gap > 0 {
2814                    if wp + gap > buf_size {
2815                        writer.write_all(&outbuf[..wp])?;
2816                        wp = 0;
2817                    }
2818                    if gap > buf_size {
2819                        writer.write_all(&data[cursor..pos])?;
2820                    } else {
2821                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
2822                        wp += gap;
2823                    }
2824                }
2825                if wp >= buf_size {
2826                    writer.write_all(&outbuf[..wp])?;
2827                    wp = 0;
2828                }
2829                outbuf[wp] = ch;
2830                wp += 1;
2831                cursor = pos + 1;
2832                while cursor < len && data[cursor] == ch {
2833                    cursor += 1;
2834                }
2835            }
2836            None => {
2837                let remaining = len - cursor;
2838                if remaining > 0 {
2839                    if wp + remaining > buf_size {
2840                        writer.write_all(&outbuf[..wp])?;
2841                        wp = 0;
2842                    }
2843                    if remaining > buf_size {
2844                        writer.write_all(&data[cursor..])?;
2845                    } else {
2846                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
2847                        wp += remaining;
2848                    }
2849                }
2850                break;
2851            }
2852        }
2853    }
2854
2855    if wp > 0 {
2856        writer.write_all(&outbuf[..wp])?;
2857    }
2858    Ok(())
2859}
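
// Illustrative sketch (hypothetical test module): the single-character mmap path
// passes the input straight through when no consecutive pair exists, and
// otherwise collapses each run to one byte.
#[cfg(test)]
mod squeeze_single_mmap_sketch {
    use super::*;

    #[test]
    fn passthrough_when_no_pair_exists() {
        let mut output = Vec::new();
        squeeze_mmap(b"x", b"axbxc", &mut output).unwrap();
        assert_eq!(output, b"axbxc");
    }

    #[test]
    fn collapses_runs() {
        let mut output = Vec::new();
        squeeze_mmap(b"x", b"axxbxxxc", &mut output).unwrap();
        assert_eq!(output, b"axbxc");
    }
}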