
coreutils_rs/tr/core.rs

1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5/// Maximum IoSlice entries per write_vectored batch.
6/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
7const MAX_IOV: usize = 1024;
8
9/// Main processing buffer: 4MB (fits in L3 cache, avoids cache thrashing).
10const BUF_SIZE: usize = 4 * 1024 * 1024;
11
12/// Stream buffer: 8MB — tr streaming operations (translate, delete, squeeze)
13/// are compute-light (single table lookup or bitset check per byte), so the
14/// bottleneck is I/O syscalls, not cache pressure. 8MB buffer means only
15/// 2 read()/write() syscall pairs for a 10MB input.
16/// This applies to ALL streaming modes (delete, squeeze, translate).
17const STREAM_BUF: usize = 8 * 1024 * 1024;
18
19/// Minimum data size to engage rayon parallel processing for mmap paths.
20/// Below this, single-threaded is faster due to thread pool overhead.
21const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
22
23/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
24/// Falls back to write_all per slice for partial writes.
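/// For example, if write_vectored reports 1500 bytes written across two 1024-byte
/// slices, the first slice is skipped (written -= 1024) and write_all resumes at
/// byte 476 of the second slice.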
25#[inline]
26fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
27    if slices.is_empty() {
28        return Ok(());
29    }
30    for batch in slices.chunks(MAX_IOV) {
31        let total: usize = batch.iter().map(|s| s.len()).sum();
32        match writer.write_vectored(batch) {
33            Ok(n) if n >= total => continue,
34            Ok(mut written) => {
35                // Partial write: fall back to write_all per remaining slice
36                for slice in batch {
37                    let slen = slice.len();
38                    if written >= slen {
39                        written -= slen;
40                        continue;
41                    }
42                    if written > 0 {
43                        writer.write_all(&slice[written..])?;
44                        written = 0;
45                    } else {
46                        writer.write_all(slice)?;
47                    }
48                }
49            }
50            Err(e) => return Err(e),
51        }
52    }
53    Ok(())
54}
55
56/// Allocate a Vec<u8> of given length without zero-initialization.
57/// SAFETY: Caller must write all bytes before reading them.
58#[inline]
59#[allow(clippy::uninit_vec)]
60fn alloc_uninit_vec(len: usize) -> Vec<u8> {
61    let mut v = Vec::with_capacity(len);
62    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
63    unsafe {
64        v.set_len(len);
65    }
66    v
67}
68
69/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
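/// If set2 is shorter than set1, the extra set1 bytes map to set2's last byte:
/// e.g. set1 = b"abc", set2 = b"xy" yields a->x, b->y, c->y.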
70#[inline]
71fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
72    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
73    let last = set2.last().copied();
74    for (i, &from) in set1.iter().enumerate() {
75        table[from as usize] = if i < set2.len() {
76            set2[i]
77        } else {
78            last.unwrap_or(from)
79        };
80    }
81    table
82}
83
84/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
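/// Byte b occupies bit (b & 7) of set[b >> 3]; e.g. b'A' (0x41) sets bit 1 of set[8].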
85#[inline]
86fn build_member_set(chars: &[u8]) -> [u8; 32] {
87    let mut set = [0u8; 32];
88    for &ch in chars {
89        set[ch as usize >> 3] |= 1 << (ch & 7);
90    }
91    set
92}
93
94#[inline(always)]
95fn is_member(set: &[u8; 32], ch: u8) -> bool {
96    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
97}
98
99/// Translate bytes in-place using a 256-byte lookup table.
101/// On x86_64, uses SIMD pshufb-based nibble decomposition: 32 bytes per
102/// iteration with AVX2, 16 with SSSE3. Falls back to 8x-unrolled scalar elsewhere.
102#[inline(always)]
103fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
104    #[cfg(target_arch = "x86_64")]
105    {
106        if is_x86_feature_detected!("avx2") {
107            unsafe { translate_inplace_avx2_table(data, table) };
108            return;
109        }
110        if is_x86_feature_detected!("ssse3") {
111            unsafe { translate_inplace_ssse3_table(data, table) };
112            return;
113        }
114    }
115    translate_inplace_scalar(data, table);
116}
117
118/// Scalar fallback: 8x-unrolled table lookup.
119#[inline(always)]
120fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
121    let len = data.len();
122    let ptr = data.as_mut_ptr();
123    let mut i = 0;
124    unsafe {
125        while i + 8 <= len {
126            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
127            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
128            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
129            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
130            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
131            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
132            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
133            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
134            i += 8;
135        }
136        while i < len {
137            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
138            i += 1;
139        }
140    }
141}
142
143// ============================================================================
144// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
145// ============================================================================
146//
147// For an arbitrary 256-byte lookup table, we decompose each byte into
148// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
149// SIMD vectors, one for each high nibble value h (0..15), containing
150// the 16 table entries table[h*16+0..h*16+15]. Then for each input
151// vector we:
152//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
153//   2. Extract high nibble (shift right 4) -> used to select which table
154//   3. For each of the 16 high nibble values, create a mask where
155//      the high nibble equals that value, pshufb the corresponding
156//      table, and accumulate results
157//
158// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
159// With instruction-level parallelism, this achieves much higher throughput
160// than scalar table lookups which have serial data dependencies.
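//
// Worked example for a single byte b = 0x6C:
//   lo = b & 0x0F        = 0x0C
//   hi = (b >> 4) & 0x0F = 0x06
//   result = lut[hi][lo] = table[0x60 + 0x0C] = table[0x6C]
// so the two-level nibble lookup reproduces the flat 256-entry table exactly;
// the SIMD versions below simply perform 16 or 32 such lookups per iteration.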
161
162#[cfg(target_arch = "x86_64")]
163#[target_feature(enable = "avx2")]
164unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
165    use std::arch::x86_64::*;
166
167    unsafe {
168        let len = data.len();
169        let ptr = data.as_mut_ptr();
170
171        // Pre-build 16 lookup vectors, one per high nibble value.
172        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
173        // 16 table entries for pshufb indexing by low nibble.
174        let mut lut = [_mm256_setzero_si256(); 16];
175        for h in 0u8..16 {
176            let base = (h as usize) * 16;
177            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
178            // Broadcast the 128-bit row to both lanes of the 256-bit vector
179            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
180            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
181        }
182
183        let lo_mask = _mm256_set1_epi8(0x0F);
184
185        let mut i = 0;
186        while i + 32 <= len {
187            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
188            let lo_nibble = _mm256_and_si256(input, lo_mask);
189            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
190
191            // Accumulate result: for each high nibble value h, select bytes where
192            // hi_nibble == h, look up in lut[h] using lo_nibble, blend into result.
193            let mut result = _mm256_setzero_si256();
194
195            // Unroll the 16 high-nibble iterations for best ILP
196            macro_rules! do_nibble {
197                ($h:expr) => {
198                    let h_val = _mm256_set1_epi8($h as i8);
199                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
200                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
201                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
202                };
203            }
204            do_nibble!(0);
205            do_nibble!(1);
206            do_nibble!(2);
207            do_nibble!(3);
208            do_nibble!(4);
209            do_nibble!(5);
210            do_nibble!(6);
211            do_nibble!(7);
212            do_nibble!(8);
213            do_nibble!(9);
214            do_nibble!(10);
215            do_nibble!(11);
216            do_nibble!(12);
217            do_nibble!(13);
218            do_nibble!(14);
219            do_nibble!(15);
220
221            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
222            i += 32;
223        }
224
225        // SSE/SSSE3 tail for remaining 16-byte chunk
226        if i + 16 <= len {
227            let lo_mask128 = _mm_set1_epi8(0x0F);
228
229            // Build 128-bit LUTs (just the lower lane of each 256-bit LUT)
230            let mut lut128 = [_mm_setzero_si128(); 16];
231            for h in 0u8..16 {
232                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
233            }
234
235            let input = _mm_loadu_si128(ptr.add(i) as *const _);
236            let lo_nib = _mm_and_si128(input, lo_mask128);
237            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
238
239            let mut res = _mm_setzero_si128();
240            macro_rules! do_nibble128 {
241                ($h:expr) => {
242                    let h_val = _mm_set1_epi8($h as i8);
243                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
244                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
245                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
246                };
247            }
248            do_nibble128!(0);
249            do_nibble128!(1);
250            do_nibble128!(2);
251            do_nibble128!(3);
252            do_nibble128!(4);
253            do_nibble128!(5);
254            do_nibble128!(6);
255            do_nibble128!(7);
256            do_nibble128!(8);
257            do_nibble128!(9);
258            do_nibble128!(10);
259            do_nibble128!(11);
260            do_nibble128!(12);
261            do_nibble128!(13);
262            do_nibble128!(14);
263            do_nibble128!(15);
264
265            _mm_storeu_si128(ptr.add(i) as *mut _, res);
266            i += 16;
267        }
268
269        // Scalar tail
270        while i < len {
271            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
272            i += 1;
273        }
274    }
275}
276
277#[cfg(target_arch = "x86_64")]
278#[target_feature(enable = "ssse3")]
279unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
280    use std::arch::x86_64::*;
281
282    unsafe {
283        let len = data.len();
284        let ptr = data.as_mut_ptr();
285
286        // Pre-build 16 lookup vectors for pshufb
287        let mut lut = [_mm_setzero_si128(); 16];
288        for h in 0u8..16 {
289            let base = (h as usize) * 16;
290            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
291            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
292        }
293
294        let lo_mask = _mm_set1_epi8(0x0F);
295
296        let mut i = 0;
297        while i + 16 <= len {
298            let input = _mm_loadu_si128(ptr.add(i) as *const _);
299            let lo_nibble = _mm_and_si128(input, lo_mask);
300            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
301
302            let mut result = _mm_setzero_si128();
303
304            macro_rules! do_nibble {
305                ($h:expr) => {
306                    let h_val = _mm_set1_epi8($h as i8);
307                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
308                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
309                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
310                };
311            }
312            do_nibble!(0);
313            do_nibble!(1);
314            do_nibble!(2);
315            do_nibble!(3);
316            do_nibble!(4);
317            do_nibble!(5);
318            do_nibble!(6);
319            do_nibble!(7);
320            do_nibble!(8);
321            do_nibble!(9);
322            do_nibble!(10);
323            do_nibble!(11);
324            do_nibble!(12);
325            do_nibble!(13);
326            do_nibble!(14);
327            do_nibble!(15);
328
329            _mm_storeu_si128(ptr.add(i) as *mut _, result);
330            i += 16;
331        }
332
333        // Scalar tail
334        while i < len {
335            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
336            i += 1;
337        }
338    }
339}
340
341/// Translate bytes from source to destination using a 256-byte lookup table.
342/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
343#[inline(always)]
344fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
345    debug_assert!(dst.len() >= src.len());
346    #[cfg(target_arch = "x86_64")]
347    {
348        if is_x86_feature_detected!("avx2") {
349            unsafe { translate_to_avx2_table(src, dst, table) };
350            return;
351        }
352        if is_x86_feature_detected!("ssse3") {
353            unsafe { translate_to_ssse3_table(src, dst, table) };
354            return;
355        }
356    }
357    translate_to_scalar(src, dst, table);
358}
359
360/// Scalar fallback for translate_to.
361#[inline(always)]
362fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
363    unsafe {
364        let sp = src.as_ptr();
365        let dp = dst.as_mut_ptr();
366        let len = src.len();
367        let mut i = 0;
368        while i + 8 <= len {
369            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
370            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
371            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
372            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
373            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
374            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
375            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
376            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
377            i += 8;
378        }
379        while i < len {
380            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
381            i += 1;
382        }
383    }
384}
385
386#[cfg(target_arch = "x86_64")]
387#[target_feature(enable = "avx2")]
388unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
389    use std::arch::x86_64::*;
390
391    unsafe {
392        let len = src.len();
393        let sp = src.as_ptr();
394        let dp = dst.as_mut_ptr();
395
396        // Pre-build 16 lookup vectors
397        let mut lut = [_mm256_setzero_si256(); 16];
398        for h in 0u8..16 {
399            let base = (h as usize) * 16;
400            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
401            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
402            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
403        }
404
405        let lo_mask = _mm256_set1_epi8(0x0F);
406
407        let mut i = 0;
408        while i + 32 <= len {
409            let input = _mm256_loadu_si256(sp.add(i) as *const _);
410            let lo_nibble = _mm256_and_si256(input, lo_mask);
411            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
412
413            let mut result = _mm256_setzero_si256();
414
415            macro_rules! do_nibble {
416                ($h:expr) => {
417                    let h_val = _mm256_set1_epi8($h as i8);
418                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
419                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
420                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
421                };
422            }
423            do_nibble!(0);
424            do_nibble!(1);
425            do_nibble!(2);
426            do_nibble!(3);
427            do_nibble!(4);
428            do_nibble!(5);
429            do_nibble!(6);
430            do_nibble!(7);
431            do_nibble!(8);
432            do_nibble!(9);
433            do_nibble!(10);
434            do_nibble!(11);
435            do_nibble!(12);
436            do_nibble!(13);
437            do_nibble!(14);
438            do_nibble!(15);
439
440            _mm256_storeu_si256(dp.add(i) as *mut _, result);
441            i += 32;
442        }
443
444        // SSSE3 tail for remaining 16-byte chunk
445        if i + 16 <= len {
446            let lo_mask128 = _mm_set1_epi8(0x0F);
447            let mut lut128 = [_mm_setzero_si128(); 16];
448            for h in 0u8..16 {
449                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
450            }
451
452            let input = _mm_loadu_si128(sp.add(i) as *const _);
453            let lo_nib = _mm_and_si128(input, lo_mask128);
454            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
455
456            let mut res = _mm_setzero_si128();
457            macro_rules! do_nibble128 {
458                ($h:expr) => {
459                    let h_val = _mm_set1_epi8($h as i8);
460                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
461                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
462                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
463                };
464            }
465            do_nibble128!(0);
466            do_nibble128!(1);
467            do_nibble128!(2);
468            do_nibble128!(3);
469            do_nibble128!(4);
470            do_nibble128!(5);
471            do_nibble128!(6);
472            do_nibble128!(7);
473            do_nibble128!(8);
474            do_nibble128!(9);
475            do_nibble128!(10);
476            do_nibble128!(11);
477            do_nibble128!(12);
478            do_nibble128!(13);
479            do_nibble128!(14);
480            do_nibble128!(15);
481
482            _mm_storeu_si128(dp.add(i) as *mut _, res);
483            i += 16;
484        }
485
486        // Scalar tail
487        while i < len {
488            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
489            i += 1;
490        }
491    }
492}
493
494#[cfg(target_arch = "x86_64")]
495#[target_feature(enable = "ssse3")]
496unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
497    use std::arch::x86_64::*;
498
499    unsafe {
500        let len = src.len();
501        let sp = src.as_ptr();
502        let dp = dst.as_mut_ptr();
503
504        let mut lut = [_mm_setzero_si128(); 16];
505        for h in 0u8..16 {
506            let base = (h as usize) * 16;
507            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
508            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
509        }
510
511        let lo_mask = _mm_set1_epi8(0x0F);
512
513        let mut i = 0;
514        while i + 16 <= len {
515            let input = _mm_loadu_si128(sp.add(i) as *const _);
516            let lo_nibble = _mm_and_si128(input, lo_mask);
517            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
518
519            let mut result = _mm_setzero_si128();
520
521            macro_rules! do_nibble {
522                ($h:expr) => {
523                    let h_val = _mm_set1_epi8($h as i8);
524                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
525                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
526                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
527                };
528            }
529            do_nibble!(0);
530            do_nibble!(1);
531            do_nibble!(2);
532            do_nibble!(3);
533            do_nibble!(4);
534            do_nibble!(5);
535            do_nibble!(6);
536            do_nibble!(7);
537            do_nibble!(8);
538            do_nibble!(9);
539            do_nibble!(10);
540            do_nibble!(11);
541            do_nibble!(12);
542            do_nibble!(13);
543            do_nibble!(14);
544            do_nibble!(15);
545
546            _mm_storeu_si128(dp.add(i) as *mut _, result);
547            i += 16;
548        }
549
550        // Scalar tail
551        while i < len {
552            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
553            i += 1;
554        }
555    }
556}
557
558// ============================================================================
559// SIMD range translation (x86_64)
560// ============================================================================
561
562/// Detect if the translate table is a single contiguous range with constant offset.
563/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
564/// table[i] = i + offset for all i in [lo, hi].
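/// For example, the table for translating a-z to A-Z yields Some((0x61, 0x7A, -32)).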
565#[inline]
566fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
567    let mut lo: Option<u8> = None;
568    let mut hi = 0u8;
569    let mut offset = 0i16;
570
571    for i in 0..256 {
572        if table[i] != i as u8 {
573            let diff = table[i] as i16 - i as i16;
574            match lo {
575                None => {
576                    lo = Some(i as u8);
577                    hi = i as u8;
578                    offset = diff;
579                }
580                Some(_) => {
581                    if diff != offset || i as u8 != hi.wrapping_add(1) {
582                        return None;
583                    }
584                    hi = i as u8;
585                }
586            }
587        }
588    }
589
590    lo.map(|l| (l, hi, offset as i8))
591}
592
593/// SIMD-accelerated range translation for mmap'd data.
594/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
595/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
596#[cfg(target_arch = "x86_64")]
597fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
598    if is_x86_feature_detected!("avx2") {
599        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
600    } else {
601        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
602    }
603}
604
605#[cfg(target_arch = "x86_64")]
606#[target_feature(enable = "avx2")]
607unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
608    use std::arch::x86_64::*;
609
610    unsafe {
611        let range = hi - lo;
612        // Bias: shift range so lo maps to -128 (signed min).
613        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
614        // For input < lo: biased wraps to large positive (signed), > threshold.
615        // For input > hi: biased > -128+range, > threshold.
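        //
        // Worked example for lo = b'a' (0x61), hi = b'z' (0x7A), range = 0x19:
        //   bias      = 0x80 - 0x61 = 0x1F
        //   threshold = 0x80 + 0x19 = 0x99 (-103 as i8)
        //   b'm' (0x6D): 0x6D + 0x1F = 0x8C = -116 <= -103 -> in range, offset applied
        //   b'0' (0x30): 0x30 + 0x1F = 0x4F = +79  >  -103 -> out of range, unchanged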
616        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
617        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
618        let offset_v = _mm256_set1_epi8(offset);
619        let zero = _mm256_setzero_si256();
620
621        let len = src.len();
622        let mut i = 0;
623
624        while i + 32 <= len {
625            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
626            let biased = _mm256_add_epi8(input, bias_v);
627            // gt = 0xFF where biased > threshold (OUT of range)
628            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
629            // mask = 0xFF where IN range (NOT gt)
630            let mask = _mm256_cmpeq_epi8(gt, zero);
631            let offset_masked = _mm256_and_si256(mask, offset_v);
632            let result = _mm256_add_epi8(input, offset_masked);
633            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
634            i += 32;
635        }
636
637        // SSE2 tail for 16-byte remainder
638        if i + 16 <= len {
639            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
640            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
641            let offset_v128 = _mm_set1_epi8(offset);
642            let zero128 = _mm_setzero_si128();
643
644            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
645            let biased = _mm_add_epi8(input, bias_v128);
646            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
647            let mask = _mm_cmpeq_epi8(gt, zero128);
648            let offset_masked = _mm_and_si128(mask, offset_v128);
649            let result = _mm_add_epi8(input, offset_masked);
650            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
651            i += 16;
652        }
653
654        // Scalar tail
655        while i < len {
656            let b = *src.get_unchecked(i);
657            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
658                b.wrapping_add(offset as u8)
659            } else {
660                b
661            };
662            i += 1;
663        }
664    }
665}
666
667#[cfg(target_arch = "x86_64")]
668#[target_feature(enable = "sse2")]
669unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
670    use std::arch::x86_64::*;
671
672    unsafe {
673        let range = hi - lo;
674        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
675        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
676        let offset_v = _mm_set1_epi8(offset);
677        let zero = _mm_setzero_si128();
678
679        let len = src.len();
680        let mut i = 0;
681
682        while i + 16 <= len {
683            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
684            let biased = _mm_add_epi8(input, bias_v);
685            let gt = _mm_cmpgt_epi8(biased, threshold_v);
686            let mask = _mm_cmpeq_epi8(gt, zero);
687            let offset_masked = _mm_and_si128(mask, offset_v);
688            let result = _mm_add_epi8(input, offset_masked);
689            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
690            i += 16;
691        }
692
693        while i < len {
694            let b = *src.get_unchecked(i);
695            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
696                b.wrapping_add(offset as u8)
697            } else {
698                b
699            };
700            i += 1;
701        }
702    }
703}
704
705/// Scalar range translation fallback for non-x86_64.
706#[cfg(not(target_arch = "x86_64"))]
707fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
708    for (i, &b) in src.iter().enumerate() {
709        dst[i] = if b >= lo && b <= hi {
710            b.wrapping_add(offset as u8)
711        } else {
712            b
713        };
714    }
715}
716
717// ============================================================================
718// In-place SIMD range translation (saves one buffer allocation in streaming)
719// ============================================================================
720
721/// In-place SIMD-accelerated range translation.
722/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
723/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
724#[cfg(target_arch = "x86_64")]
725fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
726    if is_x86_feature_detected!("avx2") {
727        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
728    } else {
729        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
730    }
731}
732
733#[cfg(target_arch = "x86_64")]
734#[target_feature(enable = "avx2")]
735unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
736    use std::arch::x86_64::*;
737
738    unsafe {
739        let range = hi - lo;
740        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
741        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
742        let offset_v = _mm256_set1_epi8(offset);
743        let zero = _mm256_setzero_si256();
744
745        let len = data.len();
746        let ptr = data.as_mut_ptr();
747        let mut i = 0;
748
749        while i + 32 <= len {
750            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
751            let biased = _mm256_add_epi8(input, bias_v);
752            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
753            let mask = _mm256_cmpeq_epi8(gt, zero);
754            let offset_masked = _mm256_and_si256(mask, offset_v);
755            let result = _mm256_add_epi8(input, offset_masked);
756            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
757            i += 32;
758        }
759
760        if i + 16 <= len {
761            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
762            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
763            let offset_v128 = _mm_set1_epi8(offset);
764            let zero128 = _mm_setzero_si128();
765
766            let input = _mm_loadu_si128(ptr.add(i) as *const _);
767            let biased = _mm_add_epi8(input, bias_v128);
768            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
769            let mask = _mm_cmpeq_epi8(gt, zero128);
770            let offset_masked = _mm_and_si128(mask, offset_v128);
771            let result = _mm_add_epi8(input, offset_masked);
772            _mm_storeu_si128(ptr.add(i) as *mut _, result);
773            i += 16;
774        }
775
776        while i < len {
777            let b = *ptr.add(i);
778            *ptr.add(i) = if b >= lo && b <= hi {
779                b.wrapping_add(offset as u8)
780            } else {
781                b
782            };
783            i += 1;
784        }
785    }
786}
787
788#[cfg(target_arch = "x86_64")]
789#[target_feature(enable = "sse2")]
790unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
791    use std::arch::x86_64::*;
792
793    unsafe {
794        let range = hi - lo;
795        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
796        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
797        let offset_v = _mm_set1_epi8(offset);
798        let zero = _mm_setzero_si128();
799
800        let len = data.len();
801        let ptr = data.as_mut_ptr();
802        let mut i = 0;
803
804        while i + 16 <= len {
805            let input = _mm_loadu_si128(ptr.add(i) as *const _);
806            let biased = _mm_add_epi8(input, bias_v);
807            let gt = _mm_cmpgt_epi8(biased, threshold_v);
808            let mask = _mm_cmpeq_epi8(gt, zero);
809            let offset_masked = _mm_and_si128(mask, offset_v);
810            let result = _mm_add_epi8(input, offset_masked);
811            _mm_storeu_si128(ptr.add(i) as *mut _, result);
812            i += 16;
813        }
814
815        while i < len {
816            let b = *ptr.add(i);
817            *ptr.add(i) = if b >= lo && b <= hi {
818                b.wrapping_add(offset as u8)
819            } else {
820                b
821            };
822            i += 1;
823        }
824    }
825}
826
827#[cfg(not(target_arch = "x86_64"))]
828fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
829    for b in data.iter_mut() {
830        if *b >= lo && *b <= hi {
831            *b = b.wrapping_add(offset as u8);
832        }
833    }
834}
835
836// ============================================================================
837// SIMD range deletion (x86_64)
838// ============================================================================
839
840/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
841/// Returns Some((lo, hi)) if so. This is true for common classes:
842/// - `[:digit:]` = 0x30..=0x39
843/// - `a-z` = 0x61..=0x7A
844/// - `A-Z` = 0x41..=0x5A
845#[inline]
846fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
847    if chars.is_empty() {
848        return None;
849    }
850    // Track distinct values so duplicate chars can't fake a contiguous range.
851    let mut seen = [false; 256];
852    let mut distinct = 0usize;
853    let (mut lo, mut hi) = (chars[0], chars[0]);
854    for &c in chars {
855        distinct += usize::from(!seen[c as usize]);
856        seen[c as usize] = true;
857        lo = lo.min(c);
858        hi = hi.max(c);
859    }
860    // Contiguous iff the distinct chars exactly cover [lo..=hi] (no gaps).
861    if (hi - lo) as usize + 1 == distinct {
862        Some((lo, hi))
863    } else {
864        None
865    }
866}
867
868/// SIMD-accelerated delete for contiguous byte ranges.
869/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
870/// then compacts output by skipping matched bytes.
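/// For example, deleting the range 0x30..=0x39 from b"a1b22c" writes b"abc" to dst
/// and returns 3.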
871#[cfg(target_arch = "x86_64")]
872fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
873    if is_x86_feature_detected!("avx2") {
874        unsafe { delete_range_avx2(src, dst, lo, hi) }
875    } else {
876        unsafe { delete_range_sse2(src, dst, lo, hi) }
877    }
878}
879
880#[cfg(target_arch = "x86_64")]
881#[target_feature(enable = "avx2")]
882unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
883    use std::arch::x86_64::*;
884
885    unsafe {
886        let range = hi - lo;
887        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
888        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
889        let zero = _mm256_setzero_si256();
890
891        let len = src.len();
892        let sp = src.as_ptr();
893        let dp = dst.as_mut_ptr();
894        let mut ri = 0;
895        let mut wp = 0;
896
897        while ri + 32 <= len {
898            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
899            let biased = _mm256_add_epi8(input, bias_v);
900            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
901            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
902            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
903            let in_range = _mm256_cmpeq_epi8(gt, zero);
904            // keep_mask bits: 1 = keep (NOT in range)
905            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
906
907            if keep_mask == 0xFFFFFFFF {
908                // All 32 bytes are kept — bulk copy
909                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
910                wp += 32;
911            } else if keep_mask != 0 {
912                // Partial keep — process each 8-byte lane with popcnt
913                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
914                let c0 = (keep_mask as u8).count_ones() as usize;
915                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
916                let c1 = ((keep_mask >> 8) as u8).count_ones() as usize;
917                compact_8bytes(
918                    sp.add(ri + 16),
919                    dp.add(wp + c0 + c1),
920                    (keep_mask >> 16) as u8,
921                );
922                let c2 = ((keep_mask >> 16) as u8).count_ones() as usize;
923                compact_8bytes(
924                    sp.add(ri + 24),
925                    dp.add(wp + c0 + c1 + c2),
926                    (keep_mask >> 24) as u8,
927                );
928                let c3 = ((keep_mask >> 24) as u8).count_ones() as usize;
929                wp += c0 + c1 + c2 + c3;
930            }
931            // else: keep_mask == 0 means all bytes deleted, skip entirely
932            ri += 32;
933        }
934
935        // SSE2 tail for 16-byte remainder
936        if ri + 16 <= len {
937            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
938            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
939            let zero128 = _mm_setzero_si128();
940
941            let input = _mm_loadu_si128(sp.add(ri) as *const _);
942            let biased = _mm_add_epi8(input, bias_v128);
943            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
944            let in_range = _mm_cmpeq_epi8(gt, zero128);
945            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
946
947            if keep_mask == 0xFFFF {
948                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
949                wp += 16;
950            } else if keep_mask != 0 {
951                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
952                let c0 = (keep_mask as u8).count_ones() as usize;
953                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
954                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
955            }
956            ri += 16;
957        }
958
959        // Scalar tail
960        while ri < len {
961            let b = *sp.add(ri);
962            if b < lo || b > hi {
963                *dp.add(wp) = b;
964                wp += 1;
965            }
966            ri += 1;
967        }
968
969        wp
970    }
971}
972
973/// Compact 8 source bytes into contiguous output bytes using a keep mask.
974/// Each bit in `mask` indicates whether the corresponding byte should be kept.
975/// Writes only the kept bytes, in order; the caller advances its write pointer
976/// by the mask's popcount.
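/// For example, mask = 0b0010_0110 copies src[1], src[2], and src[5] to dst[0..3];
/// the caller then advances its write index by mask.count_ones() = 3.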
977#[cfg(target_arch = "x86_64")]
978#[inline(always)]
979unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
980    // For each set bit position in the mask, copy that byte from src to the
981    // next output position. Using a tight trailing_zeros loop is faster than
982    // a lookup table for 8-bit masks because it's branch-predicted well
983    // and avoids cache misses on the table.
984    unsafe {
985        let mut m = mask;
986        let mut w = 0;
987        while m != 0 {
988            let bit = m.trailing_zeros() as usize;
989            *dst.add(w) = *src.add(bit);
990            w += 1;
991            m &= m - 1;
992        }
993    }
994}
995
996#[cfg(target_arch = "x86_64")]
997#[target_feature(enable = "sse2")]
998unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
999    use std::arch::x86_64::*;
1000
1001    unsafe {
1002        let range = hi - lo;
1003        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1004        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1005        let zero = _mm_setzero_si128();
1006
1007        let len = src.len();
1008        let sp = src.as_ptr();
1009        let dp = dst.as_mut_ptr();
1010        let mut ri = 0;
1011        let mut wp = 0;
1012
1013        while ri + 16 <= len {
1014            let input = _mm_loadu_si128(sp.add(ri) as *const _);
1015            let biased = _mm_add_epi8(input, bias_v);
1016            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1017            let in_range = _mm_cmpeq_epi8(gt, zero);
1018            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
1019
1020            if keep_mask == 0xFFFF {
1021                // All 16 bytes kept — bulk copy
1022                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
1023                wp += 16;
1024            } else if keep_mask != 0 {
1025                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
1026                let c0 = (keep_mask as u8).count_ones() as usize;
1027                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
1028                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
1029            }
1030            ri += 16;
1031        }
1032
1033        while ri < len {
1034            let b = *sp.add(ri);
1035            if b < lo || b > hi {
1036                *dp.add(wp) = b;
1037                wp += 1;
1038            }
1039            ri += 1;
1040        }
1041
1042        wp
1043    }
1044}
1045
1046/// Scalar range delete fallback for non-x86_64.
1047#[cfg(not(target_arch = "x86_64"))]
1048fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
1049    let mut wp = 0;
1050    for &b in src {
1051        if b < lo || b > hi {
1052            dst[wp] = b;
1053            wp += 1;
1054        }
1055    }
1056    wp
1057}
1058
1059/// Streaming delete for contiguous byte ranges using SIMD range detection.
1060/// Uses the 8MB STREAM_BUF buffer to reduce syscalls (delete is compute-light, I/O bound).
1061/// When no bytes are deleted from a chunk (common for data with few matches),
1062/// writes directly from the source buffer to avoid the copy overhead.
1063fn delete_range_streaming(
1064    lo: u8,
1065    hi: u8,
1066    reader: &mut impl Read,
1067    writer: &mut impl Write,
1068) -> io::Result<()> {
1069    let mut src = vec![0u8; STREAM_BUF];
1070    let mut dst = alloc_uninit_vec(STREAM_BUF);
1071    loop {
1072        let n = read_full(reader, &mut src)?;
1073        if n == 0 {
1074            break;
1075        }
1076        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
1077        if wp == n {
1078            // No bytes deleted — write source directly (avoids copy overhead)
1079            writer.write_all(&src[..n])?;
1080        } else if wp > 0 {
1081            writer.write_all(&dst[..wp])?;
1082        }
1083    }
1084    Ok(())
1085}
1086
1087// ============================================================================
1088// Streaming functions (Read + Write)
1089// ============================================================================
1090
1091pub fn translate(
1092    set1: &[u8],
1093    set2: &[u8],
1094    reader: &mut impl Read,
1095    writer: &mut impl Write,
1096) -> io::Result<()> {
1097    let table = build_translate_table(set1, set2);
1098
1099    // Check for identity table — pure passthrough (no transformation needed)
1100    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1101    if is_identity {
1102        return passthrough_stream(reader, writer);
1103    }
1104
1105    // Try SIMD fast path for range translations (in-place, single buffer)
1106    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1107        return translate_range_stream(lo, hi, offset, reader, writer);
1108    }
1109
1110    // General case: in-place translation on a single STREAM_BUF (8MB) buffer.
1111    // This roughly halves memory traffic vs the old separate src/dst approach:
1112    // - Old: read into src, then stream src through the table into dst (two buffers touched per chunk)
1113    // - New: read into buf, translate it in-place, write it back out (one buffer touched per chunk)
1114    // The 8x-unrolled in-place translate avoids store-to-load forwarding stalls
1115    // because consecutive reads are 8 bytes apart (sequential), not aliased.
1116    // An 8MB buffer needs only 2 read/write syscall pairs for a 10MB input (vs 10 with a 1MB buffer).
1117    let mut buf = vec![0u8; STREAM_BUF];
1118    loop {
1119        let n = read_full(reader, &mut buf)?;
1120        if n == 0 {
1121            break;
1122        }
1123        translate_inplace(&mut buf[..n], &table);
1124        writer.write_all(&buf[..n])?;
1125    }
1126    Ok(())
1127}
1128
1129/// Streaming SIMD range translation — single buffer, in-place transform.
1130/// Uses the 8MB STREAM_BUF buffer for fewer syscalls (translate is compute-light).
1131fn translate_range_stream(
1132    lo: u8,
1133    hi: u8,
1134    offset: i8,
1135    reader: &mut impl Read,
1136    writer: &mut impl Write,
1137) -> io::Result<()> {
1138    let mut buf = vec![0u8; STREAM_BUF];
1139    loop {
1140        let n = read_full(reader, &mut buf)?;
1141        if n == 0 {
1142            break;
1143        }
1144        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
1145        writer.write_all(&buf[..n])?;
1146    }
1147    Ok(())
1148}
1149
1150/// Pure passthrough: copy input to output without transformation.
1151/// Uses a single 8MB STREAM_BUF buffer with direct read/write, no processing overhead.
1152fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1153    let mut buf = vec![0u8; STREAM_BUF];
1154    loop {
1155        let n = read_full(reader, &mut buf)?;
1156        if n == 0 {
1157            break;
1158        }
1159        writer.write_all(&buf[..n])?;
1160    }
1161    Ok(())
1162}
1163
1164/// Read as many bytes as possible into buf, retrying on partial reads.
1165/// Fast path: first read() often fills the entire buffer for regular files.
1166#[inline]
1167fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
1168    // Fast path: first read() usually fills the entire buffer for regular files
1169    let n = reader.read(buf)?;
1170    if n == buf.len() || n == 0 {
1171        return Ok(n);
1172    }
1173    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
1174    let mut total = n;
1175    while total < buf.len() {
1176        match reader.read(&mut buf[total..]) {
1177            Ok(0) => break,
1178            Ok(n) => total += n,
1179            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
1180            Err(e) => return Err(e),
1181        }
1182    }
1183    Ok(total)
1184}
1185
1186pub fn translate_squeeze(
1187    set1: &[u8],
1188    set2: &[u8],
1189    reader: &mut impl Read,
1190    writer: &mut impl Write,
1191) -> io::Result<()> {
1192    let table = build_translate_table(set1, set2);
1193    let squeeze_set = build_member_set(set2);
1194
1195    // Two-pass optimization for range translations:
1196    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
1197    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
1198    // Even though it's two passes, the translate pass is so much faster with SIMD
1199    // that the total is still a net win.
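    // For example, translating a-z to A-Z with squeeze turns "hello  world" into
    // "HELO  WORLD": the doubled L collapses, while the spaces pass through
    // untouched because they are not members of set2.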
1200    let range_info = detect_range_offset(&table);
1201
1202    let mut buf = vec![0u8; STREAM_BUF];
1203    let mut last_squeezed: u16 = 256;
1204
1205    loop {
1206        let n = read_full(reader, &mut buf)?;
1207        if n == 0 {
1208            break;
1209        }
1210        // Pass 1: translate
1211        if let Some((lo, hi, offset)) = range_info {
1212            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
1213        } else {
1214            translate_inplace(&mut buf[..n], &table);
1215        }
1216        // Pass 2: squeeze in-place
1217        let mut wp = 0;
1218        unsafe {
1219            let ptr = buf.as_mut_ptr();
1220            for i in 0..n {
1221                let b = *ptr.add(i);
1222                if is_member(&squeeze_set, b) {
1223                    if last_squeezed == b as u16 {
1224                        continue;
1225                    }
1226                    last_squeezed = b as u16;
1227                } else {
1228                    last_squeezed = 256;
1229                }
1230                *ptr.add(wp) = b;
1231                wp += 1;
1232            }
1233        }
1234        writer.write_all(&buf[..wp])?;
1235    }
1236    Ok(())
1237}
1238
1239pub fn delete(
1240    delete_chars: &[u8],
1241    reader: &mut impl Read,
1242    writer: &mut impl Write,
1243) -> io::Result<()> {
1244    if delete_chars.len() == 1 {
1245        return delete_single_streaming(delete_chars[0], reader, writer);
1246    }
1247    if delete_chars.len() <= 3 {
1248        return delete_multi_streaming(delete_chars, reader, writer);
1249    }
1250
1251    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
1252    // use vectorized range comparison instead of scalar bitset lookup.
1253    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
1254    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1255        return delete_range_streaming(lo, hi, reader, writer);
1256    }
1257
1258    let member = build_member_set(delete_chars);
1259    let mut buf = vec![0u8; STREAM_BUF];
1260
1261    loop {
1262        let n = read_full(reader, &mut buf)?;
1263        if n == 0 {
1264            break;
1265        }
1266        let mut wp = 0;
1267        unsafe {
1268            let ptr = buf.as_mut_ptr();
1269            let mut i = 0;
1270            while i + 8 <= n {
1271                let b0 = *ptr.add(i);
1272                let b1 = *ptr.add(i + 1);
1273                let b2 = *ptr.add(i + 2);
1274                let b3 = *ptr.add(i + 3);
1275                let b4 = *ptr.add(i + 4);
1276                let b5 = *ptr.add(i + 5);
1277                let b6 = *ptr.add(i + 6);
1278                let b7 = *ptr.add(i + 7);
1279
1280                // Branchless: write byte then conditionally advance pointer.
1281                // Avoids branch mispredictions when most bytes are kept.
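                // If a byte is a member (deleted), wp does not advance, so the
                // next write simply overwrites it; kept bytes stay contiguous.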
1282                *ptr.add(wp) = b0;
1283                wp += !is_member(&member, b0) as usize;
1284                *ptr.add(wp) = b1;
1285                wp += !is_member(&member, b1) as usize;
1286                *ptr.add(wp) = b2;
1287                wp += !is_member(&member, b2) as usize;
1288                *ptr.add(wp) = b3;
1289                wp += !is_member(&member, b3) as usize;
1290                *ptr.add(wp) = b4;
1291                wp += !is_member(&member, b4) as usize;
1292                *ptr.add(wp) = b5;
1293                wp += !is_member(&member, b5) as usize;
1294                *ptr.add(wp) = b6;
1295                wp += !is_member(&member, b6) as usize;
1296                *ptr.add(wp) = b7;
1297                wp += !is_member(&member, b7) as usize;
1298                i += 8;
1299            }
1300            while i < n {
1301                let b = *ptr.add(i);
1302                *ptr.add(wp) = b;
1303                wp += !is_member(&member, b) as usize;
1304                i += 1;
1305            }
1306        }
1307        writer.write_all(&buf[..wp])?;
1308    }
1309    Ok(())
1310}
1311
1312fn delete_single_streaming(
1313    ch: u8,
1314    reader: &mut impl Read,
1315    writer: &mut impl Write,
1316) -> io::Result<()> {
1317    let mut src = vec![0u8; STREAM_BUF];
1318    let mut dst = alloc_uninit_vec(STREAM_BUF);
1319    loop {
1320        let n = read_full(reader, &mut src)?;
1321        if n == 0 {
1322            break;
1323        }
1324        // Use memchr to find byte positions, gap-copy to separate dst buffer.
1325        // Separate src/dst allows copy_nonoverlapping (faster than memmove)
1326        // and avoids aliasing concerns in the hot loop.
1327        let mut wp = 0;
1328        let mut i = 0;
1329        while i < n {
1330            match memchr::memchr(ch, &src[i..n]) {
1331                Some(offset) => {
1332                    if offset > 0 {
1333                        unsafe {
1334                            std::ptr::copy_nonoverlapping(
1335                                src.as_ptr().add(i),
1336                                dst.as_mut_ptr().add(wp),
1337                                offset,
1338                            );
1339                        }
1340                        wp += offset;
1341                    }
1342                    i += offset + 1;
1343                }
1344                None => {
1345                    let run_len = n - i;
1346                    if run_len > 0 {
1347                        unsafe {
1348                            std::ptr::copy_nonoverlapping(
1349                                src.as_ptr().add(i),
1350                                dst.as_mut_ptr().add(wp),
1351                                run_len,
1352                            );
1353                        }
1354                        wp += run_len;
1355                    }
1356                    break;
1357                }
1358            }
1359        }
1360        // If nothing was deleted, write from src directly (avoids extra copy)
1361        if wp == n {
1362            writer.write_all(&src[..n])?;
1363        } else if wp > 0 {
1364            writer.write_all(&dst[..wp])?;
1365        }
1366    }
1367    Ok(())
1368}
1369
1370fn delete_multi_streaming(
1371    chars: &[u8],
1372    reader: &mut impl Read,
1373    writer: &mut impl Write,
1374) -> io::Result<()> {
1375    let mut src = vec![0u8; STREAM_BUF];
1376    let mut dst = alloc_uninit_vec(STREAM_BUF);
1377    loop {
1378        let n = read_full(reader, &mut src)?;
1379        if n == 0 {
1380            break;
1381        }
1382        // Use memchr2/memchr3 to find byte positions, gap-copy to separate dst buffer.
1383        // Separate src/dst allows copy_nonoverlapping (faster than memmove).
1384        let mut wp = 0;
1385        let mut i = 0;
1386        while i < n {
1387            let found = if chars.len() == 2 {
1388                memchr::memchr2(chars[0], chars[1], &src[i..n])
1389            } else {
1390                memchr::memchr3(chars[0], chars[1], chars[2], &src[i..n])
1391            };
1392            match found {
1393                Some(offset) => {
1394                    if offset > 0 {
1395                        unsafe {
1396                            std::ptr::copy_nonoverlapping(
1397                                src.as_ptr().add(i),
1398                                dst.as_mut_ptr().add(wp),
1399                                offset,
1400                            );
1401                        }
1402                        wp += offset;
1403                    }
1404                    i += offset + 1;
1405                }
1406                None => {
1407                    let run_len = n - i;
1408                    if run_len > 0 {
1409                        unsafe {
1410                            std::ptr::copy_nonoverlapping(
1411                                src.as_ptr().add(i),
1412                                dst.as_mut_ptr().add(wp),
1413                                run_len,
1414                            );
1415                        }
1416                        wp += run_len;
1417                    }
1418                    break;
1419                }
1420            }
1421        }
1422        if wp == n {
1423            writer.write_all(&src[..n])?;
1424        } else if wp > 0 {
1425            writer.write_all(&dst[..wp])?;
1426        }
1427    }
1428    Ok(())
1429}
1430
1431pub fn delete_squeeze(
1432    delete_chars: &[u8],
1433    squeeze_chars: &[u8],
1434    reader: &mut impl Read,
1435    writer: &mut impl Write,
1436) -> io::Result<()> {
1437    let delete_set = build_member_set(delete_chars);
1438    let squeeze_set = build_member_set(squeeze_chars);
1439    let mut buf = vec![0u8; STREAM_BUF];
1440    let mut last_squeezed: u16 = 256;
1441
1442    loop {
1443        let n = read_full(reader, &mut buf)?;
1444        if n == 0 {
1445            break;
1446        }
1447        let mut wp = 0;
1448        unsafe {
1449            let ptr = buf.as_mut_ptr();
1450            for i in 0..n {
1451                let b = *ptr.add(i);
1452                if is_member(&delete_set, b) {
1453                    continue;
1454                }
1455                if is_member(&squeeze_set, b) {
1456                    if last_squeezed == b as u16 {
1457                        continue;
1458                    }
1459                    last_squeezed = b as u16;
1460                } else {
1461                    last_squeezed = 256;
1462                }
1463                *ptr.add(wp) = b;
1464                wp += 1;
1465            }
1466        }
1467        writer.write_all(&buf[..wp])?;
1468    }
1469    Ok(())
1470}
1471
1472pub fn squeeze(
1473    squeeze_chars: &[u8],
1474    reader: &mut impl Read,
1475    writer: &mut impl Write,
1476) -> io::Result<()> {
1477    if squeeze_chars.len() == 1 {
1478        return squeeze_single_stream(squeeze_chars[0], reader, writer);
1479    }
1480
1481    let member = build_member_set(squeeze_chars);
1482    let mut buf = vec![0u8; STREAM_BUF];
1483    let mut last_squeezed: u16 = 256;
1484
1485    loop {
1486        let n = read_full(reader, &mut buf)?;
1487        if n == 0 {
1488            break;
1489        }
1490        let mut wp = 0;
1491        unsafe {
1492            let ptr = buf.as_mut_ptr();
1493            for i in 0..n {
1494                let b = *ptr.add(i);
1495                if is_member(&member, b) {
1496                    if last_squeezed == b as u16 {
1497                        continue;
1498                    }
1499                    last_squeezed = b as u16;
1500                } else {
1501                    last_squeezed = 256;
1502                }
1503                *ptr.add(wp) = b;
1504                wp += 1;
1505            }
1506        }
1507        writer.write_all(&buf[..wp])?;
1508    }
1509    Ok(())
1510}
1511
1512fn squeeze_single_stream(
1513    ch: u8,
1514    reader: &mut impl Read,
1515    writer: &mut impl Write,
1516) -> io::Result<()> {
1517    // Use a two-byte pattern finder (ch,ch) to jump directly to squeeze points.
1518    // For squeeze-spaces: most of the data has no consecutive spaces, so memmem
1519    // skips over huge regions at SIMD speed. When a pair is found, we scan the
1520    // run length and collapse it to one occurrence.
1521    let pair = [ch, ch];
1522    let finder = memchr::memmem::Finder::new(&pair);
1523    let mut buf = vec![0u8; STREAM_BUF];
1524    let mut was_squeeze_char = false;
1525
1526    loop {
1527        let n = read_full(reader, &mut buf)?;
1528        if n == 0 {
1529            break;
1530        }
1531
1532        let ptr = buf.as_mut_ptr();
1533        let mut wp = 0;
1534        let mut i = 0;
1535
1536        // Handle carry-over: if previous chunk ended with squeeze char,
1537        // skip leading occurrences of that char in this chunk.
1538        if was_squeeze_char {
1539            while i < n && unsafe { *ptr.add(i) } == ch {
1540                i += 1;
1541            }
1542            if i >= n {
1543                // Entire chunk is squeeze-char continuation
1544                // was_squeeze_char remains true
1545                continue;
1546            }
1547        }
1548
1549        // Use memmem to find consecutive pairs (ch, ch) — SIMD-accelerated.
1550        // Between pairs, the data passes through unchanged.
1551        loop {
1552            match finder.find(&buf[i..n]) {
1553                Some(offset) => {
1554                    // Copy everything up to and including the first char of the pair
1555                    let copy_len = offset + 1; // include first ch
1556                    if copy_len > 0 && wp != i {
1557                        unsafe {
1558                            std::ptr::copy(ptr.add(i), ptr.add(wp), copy_len);
1559                        }
1560                    }
1561                    wp += copy_len;
1562                    i += offset + 1; // position after first ch of pair
1563                    // Skip all remaining consecutive ch bytes (the run)
1564                    while i < n && unsafe { *ptr.add(i) } == ch {
1565                        i += 1;
1566                    }
1567                    if i >= n {
1568                        was_squeeze_char = true;
1569                        break;
1570                    }
1571                }
1572                None => {
1573                    // No more consecutive pairs — copy remainder
1574                    let run_len = n - i;
1575                    if run_len > 0 && wp != i {
1576                        unsafe {
1577                            std::ptr::copy(ptr.add(i), ptr.add(wp), run_len);
1578                        }
1579                    }
1580                    wp += run_len;
1581                    // Check if chunk ends with the squeeze char
1582                    was_squeeze_char = n > 0 && unsafe { *ptr.add(n - 1) } == ch;
1583                    break;
1584                }
1585            }
1586        }
1587
1588        writer.write_all(&buf[..wp])?;
1589    }
1590    Ok(())
1591}
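
// Illustrative sketch (hypothetical test) of the single-byte squeeze fast path:
// the public squeeze() above dispatches here when the set has exactly one byte.
// Input and expected output are made up.
#[cfg(test)]
mod squeeze_single_stream_example {
    use super::*;

    #[test]
    fn collapses_runs_of_one_byte() {
        let mut input: &[u8] = b"heellooo  wooorld";
        let mut out: Vec<u8> = Vec::new();
        squeeze(b"o", &mut input, &mut out).unwrap();
        // Only runs of 'o' collapse; the doubled 'e', 'l', and spaces pass through.
        assert_eq!(&out[..], &b"heello  world"[..]);
    }
}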
1592
1593// ============================================================================
1594// Mmap-based functions (zero-copy input from byte slice)
1595// ============================================================================
1596
1597/// Maximum data size for single-allocation translate approach.
1598/// Below this limit, translate ALL data into one buffer and do a single write_all.
1599/// Above this, use chunked approach to limit memory usage.
1600const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;
1601
1602/// Translate bytes from an mmap'd byte slice.
1603/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
1604/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
1605/// Falls back to scalar 256-byte table lookup for general translations.
1606///
1607/// For data >= 2MB: uses rayon parallel processing across multiple cores.
1608/// Otherwise, for data <= 16MB: single allocation + single write_all (1 syscall).
1609/// For data > 16MB: chunked fallback to limit memory (N syscalls where N = data/4MB).
1610pub fn translate_mmap(
1611    set1: &[u8],
1612    set2: &[u8],
1613    data: &[u8],
1614    writer: &mut impl Write,
1615) -> io::Result<()> {
1616    let table = build_translate_table(set1, set2);
1617
1618    // Check if table is identity — pure passthrough
1619    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1620    if is_identity {
1621        return writer.write_all(data);
1622    }
1623
1624    // Try SIMD fast path for single-range constant-offset translations
1625    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1626        return translate_mmap_range(data, writer, lo, hi, offset);
1627    }
1628
1629    // General case: table lookup (with parallel processing for large data)
1630    translate_mmap_table(data, writer, &table)
1631}
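
// Illustrative sketch (hypothetical test) of the dispatch above: a lowercase to
// uppercase mapping is a single contiguous range with constant offset, so it
// should take the SIMD path, while an arbitrary 3-byte mapping falls back to the
// 256-byte table. Inputs and expected outputs are made up.
#[cfg(test)]
mod translate_mmap_example {
    use super::*;

    #[test]
    fn range_and_table_translations() {
        let lower: Vec<u8> = (b'a'..=b'z').collect();
        let upper: Vec<u8> = (b'A'..=b'Z').collect();
        let mut out: Vec<u8> = Vec::new();
        translate_mmap(&lower, &upper, b"tr rocks 123", &mut out).unwrap();
        assert_eq!(&out[..], &b"TR ROCKS 123"[..]);

        let mut out: Vec<u8> = Vec::new();
        translate_mmap(b"abc", b"xyz", b"aabbcc", &mut out).unwrap();
        assert_eq!(&out[..], &b"xxyyzz"[..]);
    }
}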
1632
1633/// SIMD range translate for mmap data, with rayon parallel processing.
1634fn translate_mmap_range(
1635    data: &[u8],
1636    writer: &mut impl Write,
1637    lo: u8,
1638    hi: u8,
1639    offset: i8,
1640) -> io::Result<()> {
1641    // Parallel path: split data into chunks, translate each in parallel
1642    if data.len() >= PARALLEL_THRESHOLD {
1643        let mut buf = alloc_uninit_vec(data.len());
1644        let n_threads = rayon::current_num_threads().max(1);
1645        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1646
1647        // Process chunks in parallel: each thread writes to its slice of buf
1648        data.par_chunks(chunk_size)
1649            .zip(buf.par_chunks_mut(chunk_size))
1650            .for_each(|(src_chunk, dst_chunk)| {
1651                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
1652            });
1653
1654        return writer.write_all(&buf);
1655    }
1656
1657    // Small data: single-threaded SIMD
1658    if data.len() <= SINGLE_WRITE_LIMIT {
1659        let mut buf = alloc_uninit_vec(data.len());
1660        translate_range_simd(data, &mut buf, lo, hi, offset);
1661        return writer.write_all(&buf);
1662    }
1663    // Chunked fallback (unreachable in practice: data above SINGLE_WRITE_LIMIT also exceeds PARALLEL_THRESHOLD and takes the parallel path above)
1664    let mut buf = alloc_uninit_vec(BUF_SIZE);
1665    for chunk in data.chunks(BUF_SIZE) {
1666        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
1667        writer.write_all(&buf[..chunk.len()])?;
1668    }
1669    Ok(())
1670}
1671
1672/// General table-lookup translate for mmap data, with rayon parallel processing.
1673fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
1674    // Parallel path: split data into chunks, translate each in parallel
1675    if data.len() >= PARALLEL_THRESHOLD {
1676        let mut buf = alloc_uninit_vec(data.len());
1677        let n_threads = rayon::current_num_threads().max(1);
1678        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1679
1680        data.par_chunks(chunk_size)
1681            .zip(buf.par_chunks_mut(chunk_size))
1682            .for_each(|(src_chunk, dst_chunk)| {
1683                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
1684            });
1685
1686        return writer.write_all(&buf);
1687    }
1688
1689    // Small data: single-threaded
1690    if data.len() <= SINGLE_WRITE_LIMIT {
1691        let mut buf = alloc_uninit_vec(data.len());
1692        translate_to(data, &mut buf, table);
1693        return writer.write_all(&buf);
1694    }
1695    let mut buf = alloc_uninit_vec(BUF_SIZE);
1696    for chunk in data.chunks(BUF_SIZE) {
1697        translate_to(chunk, &mut buf[..chunk.len()], table);
1698        writer.write_all(&buf[..chunk.len()])?;
1699    }
1700    Ok(())
1701}
1702
1703/// Translate + squeeze from mmap'd byte slice.
1704///
1705/// For data >= 2MB: two-phase approach (parallel translate, then sequential in-place squeeze).
1706/// Otherwise, for data <= 16MB: single-pass translate+squeeze into one buffer, one write syscall.
1707/// For data > 16MB: chunked fallback to limit memory.
1708pub fn translate_squeeze_mmap(
1709    set1: &[u8],
1710    set2: &[u8],
1711    data: &[u8],
1712    writer: &mut impl Write,
1713) -> io::Result<()> {
1714    let table = build_translate_table(set1, set2);
1715    let squeeze_set = build_member_set(set2);
1716
1717    // For large data: two-phase approach
1718    // Phase 1: parallel translate into buffer
1719    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
1720    //          (squeeze only removes bytes, never grows, so no second allocation needed)
1721    if data.len() >= PARALLEL_THRESHOLD {
1722        // Phase 1: parallel translate
1723        let mut translated = alloc_uninit_vec(data.len());
1724        let range_info = detect_range_offset(&table);
1725        let n_threads = rayon::current_num_threads().max(1);
1726        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1727
1728        if let Some((lo, hi, offset)) = range_info {
1729            data.par_chunks(chunk_size)
1730                .zip(translated.par_chunks_mut(chunk_size))
1731                .for_each(|(src_chunk, dst_chunk)| {
1732                    translate_range_simd(
1733                        src_chunk,
1734                        &mut dst_chunk[..src_chunk.len()],
1735                        lo,
1736                        hi,
1737                        offset,
1738                    );
1739                });
1740        } else {
1741            data.par_chunks(chunk_size)
1742                .zip(translated.par_chunks_mut(chunk_size))
1743                .for_each(|(src_chunk, dst_chunk)| {
1744                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
1745                });
1746        }
1747
1748        // Phase 2: squeeze in-place on the translated buffer.
1749        // Since squeeze only removes bytes (never grows), we can read ahead and
1750        // compact into the same buffer, saving a full data.len() heap allocation.
1751        let mut last_squeezed: u16 = 256;
1752        let len = translated.len();
1753        let mut wp = 0;
1754        unsafe {
1755            let ptr = translated.as_mut_ptr();
1756            let mut i = 0;
1757            while i < len {
1758                let b = *ptr.add(i);
1759                if is_member(&squeeze_set, b) {
1760                    if last_squeezed == b as u16 {
1761                        i += 1;
1762                        continue;
1763                    }
1764                    last_squeezed = b as u16;
1765                } else {
1766                    last_squeezed = 256;
1767                }
1768                *ptr.add(wp) = b;
1769                wp += 1;
1770                i += 1;
1771            }
1772        }
1773        return writer.write_all(&translated[..wp]);
1774    }
1775
1776    // Single-write fast path: translate+squeeze all data in one pass, one write
1777    if data.len() <= SINGLE_WRITE_LIMIT {
1778        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
1779        let mut last_squeezed: u16 = 256;
1780        unsafe {
1781            buf.set_len(data.len());
1782            let outp: *mut u8 = buf.as_mut_ptr();
1783            let inp = data.as_ptr();
1784            let len = data.len();
1785            let mut wp = 0;
1786            let mut i = 0;
1787            while i < len {
1788                let translated = *table.get_unchecked(*inp.add(i) as usize);
1789                if is_member(&squeeze_set, translated) {
1790                    if last_squeezed == translated as u16 {
1791                        i += 1;
1792                        continue;
1793                    }
1794                    last_squeezed = translated as u16;
1795                } else {
1796                    last_squeezed = 256;
1797                }
1798                *outp.add(wp) = translated;
1799                wp += 1;
1800                i += 1;
1801            }
1802            buf.set_len(wp);
1803        }
1804        return writer.write_all(&buf);
1805    }
1806
1807    // Chunked path for large data
1808    let buf_size = data.len().min(BUF_SIZE);
1809    let mut buf = vec![0u8; buf_size];
1810    let mut last_squeezed: u16 = 256;
1811
1812    for chunk in data.chunks(buf_size) {
1813        translate_to(chunk, &mut buf[..chunk.len()], &table);
1814        let mut wp = 0;
1815        unsafe {
1816            let ptr = buf.as_mut_ptr();
1817            for i in 0..chunk.len() {
1818                let b = *ptr.add(i);
1819                if is_member(&squeeze_set, b) {
1820                    if last_squeezed == b as u16 {
1821                        continue;
1822                    }
1823                    last_squeezed = b as u16;
1824                } else {
1825                    last_squeezed = 256;
1826                }
1827                *ptr.add(wp) = b;
1828                wp += 1;
1829            }
1830        }
1831        writer.write_all(&buf[..wp])?;
1832    }
1833    Ok(())
1834}
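
// Illustrative sketch (hypothetical test) of translate+squeeze: bytes are mapped
// through set1 -> set2 first, then runs of set2 members collapse. Data is made up.
#[cfg(test)]
mod translate_squeeze_mmap_example {
    use super::*;

    #[test]
    fn translates_then_squeezes_the_result() {
        let mut out: Vec<u8> = Vec::new();
        // 'a' and 'b' both map to '_', so "a b ab" translates to "_ _ __",
        // and squeezing on set2 collapses the trailing "__" to a single "_".
        translate_squeeze_mmap(b"ab", b"__", b"a b ab", &mut out).unwrap();
        assert_eq!(&out[..], &b"_ _ _"[..]);
    }
}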
1835
1836/// Delete from mmap'd byte slice.
1837///
1838/// For data >= 2MB: uses rayon parallel processing across multiple cores.
1839/// Otherwise, for data <= 16MB: delete into one buffer, one write syscall.
1840/// For data > 16MB: chunked fallback to limit memory.
1841pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1842    if delete_chars.len() == 1 {
1843        return delete_single_char_mmap(delete_chars[0], data, writer);
1844    }
1845    if delete_chars.len() <= 3 {
1846        return delete_multi_memchr_mmap(delete_chars, data, writer);
1847    }
1848
1849    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
1850    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1851        return delete_range_mmap(data, writer, lo, hi);
1852    }
1853
1854    let member = build_member_set(delete_chars);
1855
1856    // Parallel path: pre-allocate a single output buffer of data.len() and have each
1857    // thread write to its non-overlapping slice, then do a single write_all.
1858    // This avoids per-chunk Vec allocations that the old approach had.
1859    if data.len() >= PARALLEL_THRESHOLD {
1860        let n_threads = rayon::current_num_threads().max(1);
1861        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1862
1863        // Each thread deletes into its slice of outbuf and returns bytes written.
1864        let mut outbuf = alloc_uninit_vec(data.len());
1865        let chunk_lens: Vec<usize> = data
1866            .par_chunks(chunk_size)
1867            .zip(outbuf.par_chunks_mut(chunk_size))
1868            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
1869            .collect();
1870
1871        // Compact: move each chunk's output to be contiguous.
1872        // chunk_lens[i] is how many bytes thread i wrote into its slice.
1873        // We need to shift them together since each dst_chunk started at chunk_size offsets.
1874        let mut write_pos = 0;
1875        let mut src_offset = 0;
1876        for &clen in &chunk_lens {
1877            if clen > 0 && src_offset != write_pos {
1878                unsafe {
1879                    std::ptr::copy(
1880                        outbuf.as_ptr().add(src_offset),
1881                        outbuf.as_mut_ptr().add(write_pos),
1882                        clen,
1883                    );
1884                }
1885            }
1886            write_pos += clen;
1887            src_offset += chunk_size;
1888        }
1889
1890        return writer.write_all(&outbuf[..write_pos]);
1891    }
1892
1893    // Single-write fast path: delete into one buffer, one write
1894    if data.len() <= SINGLE_WRITE_LIMIT {
1895        let mut outbuf = alloc_uninit_vec(data.len());
1896        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
1897        return writer.write_all(&outbuf[..out_pos]);
1898    }
1899
1900    // Chunked path for large data
1901    let buf_size = data.len().min(BUF_SIZE);
1902    let mut outbuf = alloc_uninit_vec(buf_size);
1903
1904    for chunk in data.chunks(buf_size) {
1905        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
1906        writer.write_all(&outbuf[..out_pos])?;
1907    }
1908    Ok(())
1909}
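
// Illustrative sketch (hypothetical test) of the bitset delete path: five
// non-contiguous bytes, so the dispatch above should bypass the memchr and
// range fast paths and use the 256-bit membership set. Data is made up.
#[cfg(test)]
mod delete_mmap_example {
    use super::*;

    #[test]
    fn deletes_all_vowels() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b"aeiou", b"hello world", &mut out).unwrap();
        assert_eq!(&out[..], &b"hll wrld"[..]);
    }
}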
1910
1911/// SIMD range delete for mmap data, with rayon parallel processing.
1912fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
1913    // Parallel path: each thread deletes from its chunk into a local Vec
1914    if data.len() >= PARALLEL_THRESHOLD {
1915        let n_threads = rayon::current_num_threads().max(1);
1916        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1917
1918        let results: Vec<Vec<u8>> = data
1919            .par_chunks(chunk_size)
1920            .map(|chunk| {
1921                let mut out = alloc_uninit_vec(chunk.len());
1922                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
1923                unsafe { out.set_len(wp) };
1924                out
1925            })
1926            .collect();
1927
1928        let slices: Vec<std::io::IoSlice> = results
1929            .iter()
1930            .filter(|r| !r.is_empty())
1931            .map(|r| std::io::IoSlice::new(r))
1932            .collect();
1933        return write_ioslices(writer, &slices);
1934    }
1935
1936    // Single-write fast path
1937    if data.len() <= SINGLE_WRITE_LIMIT {
1938        let mut outbuf = alloc_uninit_vec(data.len());
1939        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
1940        return writer.write_all(&outbuf[..wp]);
1941    }
1942
1943    // Chunked path
1944    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
1945    for chunk in data.chunks(BUF_SIZE) {
1946        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
1947        writer.write_all(&outbuf[..wp])?;
1948    }
1949    Ok(())
1950}
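
// Illustrative sketch (hypothetical test): a full digit set is one contiguous
// range, so delete_mmap should route it through detect_delete_range to the SIMD
// range path above. Data and expected output are made up.
#[cfg(test)]
mod delete_range_mmap_example {
    use super::*;

    #[test]
    fn strips_a_contiguous_byte_range() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b"0123456789", b"tr 2024 rocks 100%", &mut out).unwrap();
        assert_eq!(&out[..], &b"tr  rocks %"[..]);
    }
}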
1951
1952/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
1953/// Returns number of bytes written.
1954#[inline]
1955fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
1956    let len = chunk.len();
1957    let mut out_pos = 0;
1958    let mut i = 0;
1959
1960    while i + 8 <= len {
1961        unsafe {
1962            let b0 = *chunk.get_unchecked(i);
1963            let b1 = *chunk.get_unchecked(i + 1);
1964            let b2 = *chunk.get_unchecked(i + 2);
1965            let b3 = *chunk.get_unchecked(i + 3);
1966            let b4 = *chunk.get_unchecked(i + 4);
1967            let b5 = *chunk.get_unchecked(i + 5);
1968            let b6 = *chunk.get_unchecked(i + 6);
1969            let b7 = *chunk.get_unchecked(i + 7);
1970
1971            *outbuf.get_unchecked_mut(out_pos) = b0;
1972            out_pos += !is_member(member, b0) as usize;
1973            *outbuf.get_unchecked_mut(out_pos) = b1;
1974            out_pos += !is_member(member, b1) as usize;
1975            *outbuf.get_unchecked_mut(out_pos) = b2;
1976            out_pos += !is_member(member, b2) as usize;
1977            *outbuf.get_unchecked_mut(out_pos) = b3;
1978            out_pos += !is_member(member, b3) as usize;
1979            *outbuf.get_unchecked_mut(out_pos) = b4;
1980            out_pos += !is_member(member, b4) as usize;
1981            *outbuf.get_unchecked_mut(out_pos) = b5;
1982            out_pos += !is_member(member, b5) as usize;
1983            *outbuf.get_unchecked_mut(out_pos) = b6;
1984            out_pos += !is_member(member, b6) as usize;
1985            *outbuf.get_unchecked_mut(out_pos) = b7;
1986            out_pos += !is_member(member, b7) as usize;
1987        }
1988        i += 8;
1989    }
1990
1991    while i < len {
1992        unsafe {
1993            let b = *chunk.get_unchecked(i);
1994            *outbuf.get_unchecked_mut(out_pos) = b;
1995            out_pos += !is_member(member, b) as usize;
1996        }
1997        i += 1;
1998    }
1999
2000    out_pos
2001}
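
// Illustrative sketch (hypothetical test) of the branchless compaction above:
// every byte is written at out_pos, but out_pos only advances when the byte is
// NOT a member, so deleted bytes are overwritten by the next survivor.
#[cfg(test)]
mod delete_chunk_bitset_example {
    use super::*;

    #[test]
    fn keeps_only_non_members() {
        let member = build_member_set(b"aeiou");
        let chunk = b"coreutils";
        let mut outbuf = vec![0u8; chunk.len()];
        let n = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        assert_eq!(&outbuf[..n], &b"crtls"[..]);
    }
}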
2002
2003fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2004    // Parallel path for large data: each thread deletes from its chunk,
2005    // then use writev to write all results in one syscall batch.
2006    if data.len() >= PARALLEL_THRESHOLD {
2007        let n_threads = rayon::current_num_threads().max(1);
2008        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2009
2010        let results: Vec<Vec<u8>> = data
2011            .par_chunks(chunk_size)
2012            .map(|chunk| {
2013                let mut out = Vec::with_capacity(chunk.len());
2014                let mut last = 0;
2015                for pos in memchr::memchr_iter(ch, chunk) {
2016                    if pos > last {
2017                        out.extend_from_slice(&chunk[last..pos]);
2018                    }
2019                    last = pos + 1;
2020                }
2021                if last < chunk.len() {
2022                    out.extend_from_slice(&chunk[last..]);
2023                }
2024                out
2025            })
2026            .collect();
2027
2028        // Use writev to batch all results into fewer syscalls
2029        let slices: Vec<std::io::IoSlice> = results
2030            .iter()
2031            .filter(|r| !r.is_empty())
2032            .map(|r| std::io::IoSlice::new(r))
2033            .collect();
2034        return write_ioslices(writer, &slices);
2035    }
2036
2037    // Single-write fast path: collect all non-deleted spans into one buffer
2038    if data.len() <= SINGLE_WRITE_LIMIT {
2039        let mut outbuf = Vec::with_capacity(data.len());
2040        let mut last = 0;
2041        for pos in memchr::memchr_iter(ch, data) {
2042            if pos > last {
2043                outbuf.extend_from_slice(&data[last..pos]);
2044            }
2045            last = pos + 1;
2046        }
2047        if last < data.len() {
2048            outbuf.extend_from_slice(&data[last..]);
2049        }
2050        return writer.write_all(&outbuf);
2051    }
2052
2053    // Chunked path for large data
2054    let buf_size = data.len().min(BUF_SIZE);
2055    let mut outbuf = vec![0u8; buf_size];
2056
2057    for chunk in data.chunks(buf_size) {
2058        let mut wp = 0;
2059        let mut last = 0;
2060        for pos in memchr::memchr_iter(ch, chunk) {
2061            if pos > last {
2062                let run = pos - last;
2063                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
2064                wp += run;
2065            }
2066            last = pos + 1;
2067        }
2068        if last < chunk.len() {
2069            let run = chunk.len() - last;
2070            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
2071            wp += run;
2072        }
2073        writer.write_all(&outbuf[..wp])?;
2074    }
2075    Ok(())
2076}
2077
2078fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2079    let c0 = chars[0];
2080    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
2081    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
2082    let is_three = chars.len() >= 3;
2083
2084    // Parallel path for large data
2085    if data.len() >= PARALLEL_THRESHOLD {
2086        let n_threads = rayon::current_num_threads().max(1);
2087        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2088
2089        let results: Vec<Vec<u8>> = data
2090            .par_chunks(chunk_size)
2091            .map(|chunk| {
2092                let mut out = Vec::with_capacity(chunk.len());
2093                let mut last = 0;
2094                if is_three {
2095                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
2096                        if pos > last {
2097                            out.extend_from_slice(&chunk[last..pos]);
2098                        }
2099                        last = pos + 1;
2100                    }
2101                } else {
2102                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
2103                        if pos > last {
2104                            out.extend_from_slice(&chunk[last..pos]);
2105                        }
2106                        last = pos + 1;
2107                    }
2108                }
2109                if last < chunk.len() {
2110                    out.extend_from_slice(&chunk[last..]);
2111                }
2112                out
2113            })
2114            .collect();
2115
2116        // Use writev to batch all results into fewer syscalls
2117        let slices: Vec<std::io::IoSlice> = results
2118            .iter()
2119            .filter(|r| !r.is_empty())
2120            .map(|r| std::io::IoSlice::new(r))
2121            .collect();
2122        return write_ioslices(writer, &slices);
2123    }
2124
2125    // Single-write fast path: collect all non-deleted spans into one buffer
2126    if data.len() <= SINGLE_WRITE_LIMIT {
2127        let mut outbuf = Vec::with_capacity(data.len());
2128        let mut last = 0;
2129        if is_three {
2130            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
2131                if pos > last {
2132                    outbuf.extend_from_slice(&data[last..pos]);
2133                }
2134                last = pos + 1;
2135            }
2136        } else {
2137            for pos in memchr::memchr2_iter(c0, c1, data) {
2138                if pos > last {
2139                    outbuf.extend_from_slice(&data[last..pos]);
2140                }
2141                last = pos + 1;
2142            }
2143        }
2144        if last < data.len() {
2145            outbuf.extend_from_slice(&data[last..]);
2146        }
2147        return writer.write_all(&outbuf);
2148    }
2149
2150    // Chunked path for large data
2151    let buf_size = data.len().min(BUF_SIZE);
2152    let mut outbuf = vec![0u8; buf_size];
2153
2154    for chunk in data.chunks(buf_size) {
2155        let mut wp = 0;
2156        let mut last = 0;
2157
2158        // Iterate directly over memchr iterator without collecting into Vec<usize>.
2159        // Positions are used exactly once in order, so no intermediate allocation needed.
2160        if is_three {
2161            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
2162                if pos > last {
2163                    let run = pos - last;
2164                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
2165                    wp += run;
2166                }
2167                last = pos + 1;
2168            }
2169        } else {
2170            for pos in memchr::memchr2_iter(c0, c1, chunk) {
2171                if pos > last {
2172                    let run = pos - last;
2173                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
2174                    wp += run;
2175                }
2176                last = pos + 1;
2177            }
2178        }
2179
2180        if last < chunk.len() {
2181            let run = chunk.len() - last;
2182            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
2183            wp += run;
2184        }
2185        writer.write_all(&outbuf[..wp])?;
2186    }
2187    Ok(())
2188}
2189
2190/// Delete + squeeze from mmap'd byte slice.
2191///
2192/// For data <= 16MB: delete+squeeze into one buffer, one write syscall.
2193/// For data > 16MB: chunked approach to limit memory.
2194pub fn delete_squeeze_mmap(
2195    delete_chars: &[u8],
2196    squeeze_chars: &[u8],
2197    data: &[u8],
2198    writer: &mut impl Write,
2199) -> io::Result<()> {
2200    let delete_set = build_member_set(delete_chars);
2201    let squeeze_set = build_member_set(squeeze_chars);
2202
2203    // Single-write fast path: delete+squeeze all data in one pass, one write
2204    if data.len() <= SINGLE_WRITE_LIMIT {
2205        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
2206        let mut last_squeezed: u16 = 256;
2207        unsafe {
2208            outbuf.set_len(data.len());
2209            let outp: *mut u8 = outbuf.as_mut_ptr();
2210            let inp = data.as_ptr();
2211            let len = data.len();
2212            let mut out_pos = 0;
2213            let mut i = 0;
2214            while i < len {
2215                let b = *inp.add(i);
2216                if is_member(&delete_set, b) {
2217                    i += 1;
2218                    continue;
2219                }
2220                if is_member(&squeeze_set, b) {
2221                    if last_squeezed == b as u16 {
2222                        i += 1;
2223                        continue;
2224                    }
2225                    last_squeezed = b as u16;
2226                } else {
2227                    last_squeezed = 256;
2228                }
2229                *outp.add(out_pos) = b;
2230                out_pos += 1;
2231                i += 1;
2232            }
2233            outbuf.set_len(out_pos);
2234        }
2235        return writer.write_all(&outbuf);
2236    }
2237
2238    // Chunked path for large data
2239    let buf_size = data.len().min(BUF_SIZE);
2240    let mut outbuf = vec![0u8; buf_size];
2241    let mut last_squeezed: u16 = 256;
2242
2243    for chunk in data.chunks(buf_size) {
2244        let mut out_pos = 0;
2245        for &b in chunk {
2246            if is_member(&delete_set, b) {
2247                continue;
2248            }
2249            if is_member(&squeeze_set, b) {
2250                if last_squeezed == b as u16 {
2251                    continue;
2252                }
2253                last_squeezed = b as u16;
2254            } else {
2255                last_squeezed = 256;
2256            }
2257            unsafe {
2258                *outbuf.get_unchecked_mut(out_pos) = b;
2259            }
2260            out_pos += 1;
2261        }
2262        writer.write_all(&outbuf[..out_pos])?;
2263    }
2264    Ok(())
2265}
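
// Illustrative sketch (hypothetical test) of the mmap delete+squeeze path:
// deletion happens first, so a run that was split only by deleted bytes still
// collapses. Data is made up.
#[cfg(test)]
mod delete_squeeze_mmap_example {
    use super::*;

    #[test]
    fn squeezes_across_deleted_bytes() {
        let mut out: Vec<u8> = Vec::new();
        // Deleting '-' leaves "aa  bb"; squeezing on "ab " then collapses each run.
        delete_squeeze_mmap(b"-", b"ab ", b"a-a  b-b", &mut out).unwrap();
        assert_eq!(&out[..], &b"a b"[..]);
    }
}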
2266
2267/// Squeeze from mmap'd byte slice.
2268///
2269/// For data >= 2MB: uses rayon parallel processing with boundary fixup.
2270/// Otherwise, for data <= 16MB: squeeze into one buffer, one write syscall.
2271/// For data > 16MB: chunked fallback to limit memory.
2272pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2273    if squeeze_chars.len() == 1 {
2274        return squeeze_single_mmap(squeeze_chars[0], data, writer);
2275    }
2276    if squeeze_chars.len() == 2 {
2277        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
2278    }
2279    if squeeze_chars.len() == 3 {
2280        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
2281    }
2282
2283    let member = build_member_set(squeeze_chars);
2284
2285    // Parallel path: squeeze each chunk independently, then fix boundaries
2286    if data.len() >= PARALLEL_THRESHOLD {
2287        let n_threads = rayon::current_num_threads().max(1);
2288        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2289
2290        let results: Vec<Vec<u8>> = data
2291            .par_chunks(chunk_size)
2292            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2293            .collect();
2294
2295        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
2296        // and chunk N+1 starts with same byte B, and B is in squeeze set,
2297        // skip the first byte(s) of chunk N+1 that equal B.
2298        // Collect slices for writev to minimize syscalls.
2299        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2300        for (idx, result) in results.iter().enumerate() {
2301            if result.is_empty() {
2302                continue;
2303            }
2304            if idx > 0 {
2305                // Check boundary: does previous chunk end with same squeezable byte?
2306                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2307                    if is_member(&member, prev_last) {
2308                        // Skip leading bytes in this chunk that equal prev_last
2309                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2310                        if skip < result.len() {
2311                            slices.push(std::io::IoSlice::new(&result[skip..]));
2312                        }
2313                        continue;
2314                    }
2315                }
2316            }
2317            slices.push(std::io::IoSlice::new(result));
2318        }
2319        return write_ioslices(writer, &slices);
2320    }
2321
2322    // Single-write fast path: squeeze all data into one buffer, one write
2323    if data.len() <= SINGLE_WRITE_LIMIT {
2324        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
2325        let mut last_squeezed: u16 = 256;
2326        let len = data.len();
2327        let mut wp = 0;
2328        let mut i = 0;
2329
2330        unsafe {
2331            outbuf.set_len(data.len());
2332            let inp = data.as_ptr();
2333            let outp: *mut u8 = outbuf.as_mut_ptr();
2334
2335            while i < len {
2336                let b = *inp.add(i);
2337                if is_member(&member, b) {
2338                    if last_squeezed != b as u16 {
2339                        *outp.add(wp) = b;
2340                        wp += 1;
2341                        last_squeezed = b as u16;
2342                    }
2343                    i += 1;
2344                    while i < len && *inp.add(i) == b {
2345                        i += 1;
2346                    }
2347                } else {
2348                    last_squeezed = 256;
2349                    *outp.add(wp) = b;
2350                    wp += 1;
2351                    i += 1;
2352                }
2353            }
2354            outbuf.set_len(wp);
2355        }
2356        return writer.write_all(&outbuf);
2357    }
2358
2359    // Chunked path for large data
2360    let buf_size = data.len().min(BUF_SIZE);
2361    let mut outbuf = vec![0u8; buf_size];
2362    let mut last_squeezed: u16 = 256;
2363
2364    for chunk in data.chunks(buf_size) {
2365        let len = chunk.len();
2366        let mut wp = 0;
2367        let mut i = 0;
2368
2369        unsafe {
2370            let inp = chunk.as_ptr();
2371            let outp = outbuf.as_mut_ptr();
2372
2373            while i < len {
2374                let b = *inp.add(i);
2375                if is_member(&member, b) {
2376                    if last_squeezed != b as u16 {
2377                        *outp.add(wp) = b;
2378                        wp += 1;
2379                        last_squeezed = b as u16;
2380                    }
2381                    i += 1;
2382                    while i < len && *inp.add(i) == b {
2383                        i += 1;
2384                    }
2385                } else {
2386                    last_squeezed = 256;
2387                    *outp.add(wp) = b;
2388                    wp += 1;
2389                    i += 1;
2390                }
2391            }
2392        }
2393        writer.write_all(&outbuf[..wp])?;
2394    }
2395    Ok(())
2396}
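
// Illustrative sketch (hypothetical test) of the general (4+ byte set) squeeze
// path: consecutive duplicates of any set member collapse to one occurrence,
// while bytes outside the set pass through untouched. Data is made up.
#[cfg(test)]
mod squeeze_mmap_example {
    use super::*;

    #[test]
    fn collapses_runs_of_set_members() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b"aeiou", b"bookkeeper mood", &mut out).unwrap();
        assert_eq!(&out[..], &b"bokkeper mod"[..]);
    }
}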
2397
2398/// Squeeze a single chunk using bitset membership. Returns squeezed output.
2399fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
2400    let len = chunk.len();
2401    let mut out = Vec::with_capacity(len);
2402    let mut last_squeezed: u16 = 256;
2403    let mut i = 0;
2404
2405    unsafe {
2406        out.set_len(len);
2407        let inp = chunk.as_ptr();
2408        let outp: *mut u8 = out.as_mut_ptr();
2409        let mut wp = 0;
2410
2411        while i < len {
2412            let b = *inp.add(i);
2413            if is_member(member, b) {
2414                if last_squeezed != b as u16 {
2415                    *outp.add(wp) = b;
2416                    wp += 1;
2417                    last_squeezed = b as u16;
2418                }
2419                i += 1;
2420                while i < len && *inp.add(i) == b {
2421                    i += 1;
2422                }
2423            } else {
2424                last_squeezed = 256;
2425                *outp.add(wp) = b;
2426                wp += 1;
2427                i += 1;
2428            }
2429        }
2430        out.set_len(wp);
2431    }
2432    out
2433}
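
// Illustrative sketch (hypothetical test) of the boundary fixup the parallel
// paths rely on: chunks are squeezed independently, so when one chunk ends with
// a member byte and the next begins with the same byte, the caller must drop
// that leading duplicate before concatenating.
#[cfg(test)]
mod squeeze_chunk_boundary_example {
    use super::*;

    #[test]
    fn adjacent_chunks_need_one_byte_dropped() {
        let member = build_member_set(b" ");
        let left = squeeze_chunk_bitset(b"a  ", &member); // ends with ' '
        let right = squeeze_chunk_bitset(b"  b", &member); // starts with ' '
        assert_eq!(&left[..], &b"a "[..]);
        assert_eq!(&right[..], &b" b"[..]);
        // Naive concatenation would give "a  b"; skipping right's leading ' '
        // reproduces the squeeze of the whole input "a    b" -> "a b".
        assert_eq!([&left[..], &right[1..]].concat(), b"a b");
    }
}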
2434
2435fn squeeze_multi_mmap<const N: usize>(
2436    chars: &[u8],
2437    data: &[u8],
2438    writer: &mut impl Write,
2439) -> io::Result<()> {
2440    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
2441    if data.len() >= PARALLEL_THRESHOLD {
2442        let member = build_member_set(chars);
2443        let n_threads = rayon::current_num_threads().max(1);
2444        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2445
2446        let results: Vec<Vec<u8>> = data
2447            .par_chunks(chunk_size)
2448            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2449            .collect();
2450
2451        // Build IoSlice list, fixing boundaries
2452        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2453        for (idx, result) in results.iter().enumerate() {
2454            if result.is_empty() {
2455                continue;
2456            }
2457            if idx > 0 {
2458                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2459                    if is_member(&member, prev_last) {
2460                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2461                        if skip < result.len() {
2462                            slices.push(std::io::IoSlice::new(&result[skip..]));
2463                        }
2464                        continue;
2465                    }
2466                }
2467            }
2468            slices.push(std::io::IoSlice::new(result));
2469        }
2470        return write_ioslices(writer, &slices);
2471    }
2472
2473    let buf_size = data.len().min(BUF_SIZE);
2474    let mut outbuf = vec![0u8; buf_size];
2475    let mut wp = 0;
2476    let mut last_squeezed: u16 = 256;
2477    let mut cursor = 0;
2478
2479    macro_rules! find_next {
2480        ($data:expr) => {
2481            if N == 2 {
2482                memchr::memchr2(chars[0], chars[1], $data)
2483            } else {
2484                memchr::memchr3(chars[0], chars[1], chars[2], $data)
2485            }
2486        };
2487    }
2488
2489    macro_rules! flush_and_copy {
2490        ($src:expr, $len:expr) => {
2491            if wp + $len > buf_size {
2492                writer.write_all(&outbuf[..wp])?;
2493                wp = 0;
2494            }
2495            if $len > buf_size {
2496                writer.write_all($src)?;
2497            } else {
2498                outbuf[wp..wp + $len].copy_from_slice($src);
2499                wp += $len;
2500            }
2501        };
2502    }
2503
2504    while cursor < data.len() {
2505        match find_next!(&data[cursor..]) {
2506            Some(offset) => {
2507                let pos = cursor + offset;
2508                let b = data[pos];
2509                if pos > cursor {
2510                    let span = pos - cursor;
2511                    flush_and_copy!(&data[cursor..pos], span);
2512                    last_squeezed = 256;
2513                }
2514                if last_squeezed != b as u16 {
2515                    if wp >= buf_size {
2516                        writer.write_all(&outbuf[..wp])?;
2517                        wp = 0;
2518                    }
2519                    outbuf[wp] = b;
2520                    wp += 1;
2521                    last_squeezed = b as u16;
2522                }
2523                let mut skip = pos + 1;
2524                while skip < data.len() && data[skip] == b {
2525                    skip += 1;
2526                }
2527                cursor = skip;
2528            }
2529            None => {
2530                let remaining = data.len() - cursor;
2531                flush_and_copy!(&data[cursor..], remaining);
2532                break;
2533            }
2534        }
2535    }
2536    if wp > 0 {
2537        writer.write_all(&outbuf[..wp])?;
2538    }
2539    Ok(())
2540}
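
// Illustrative sketch (hypothetical test) of the 2-byte path above (N == 2):
// squeeze_mmap dispatches here for a two-byte set, and the memchr2-driven loop
// collapses runs of either byte. Data is made up.
#[cfg(test)]
mod squeeze_multi_mmap_example {
    use super::*;

    #[test]
    fn squeezes_two_distinct_bytes() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b" \n", b"a  b\n\n\nc d", &mut out).unwrap();
        assert_eq!(&out[..], &b"a b\nc d"[..]);
    }
}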
2541
2542fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2543    if data.is_empty() {
2544        return Ok(());
2545    }
2546
2547    if memchr::memmem::find(data, &[ch, ch]).is_none() {
2548        return writer.write_all(data);
2549    }
2550
2551    // Parallel path: squeeze each chunk, fix boundaries
2552    if data.len() >= PARALLEL_THRESHOLD {
2553        let n_threads = rayon::current_num_threads().max(1);
2554        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2555
2556        let results: Vec<Vec<u8>> = data
2557            .par_chunks(chunk_size)
2558            .map(|chunk| {
2559                let mut out = Vec::with_capacity(chunk.len());
2560                let mut cursor = 0;
2561                while cursor < chunk.len() {
2562                    match memchr::memchr(ch, &chunk[cursor..]) {
2563                        Some(offset) => {
2564                            let pos = cursor + offset;
2565                            if pos > cursor {
2566                                out.extend_from_slice(&chunk[cursor..pos]);
2567                            }
2568                            out.push(ch);
2569                            cursor = pos + 1;
2570                            while cursor < chunk.len() && chunk[cursor] == ch {
2571                                cursor += 1;
2572                            }
2573                        }
2574                        None => {
2575                            out.extend_from_slice(&chunk[cursor..]);
2576                            break;
2577                        }
2578                    }
2579                }
2580                out
2581            })
2582            .collect();
2583
2584        // Build IoSlice list, fixing boundary squeezability.
2585        // Use writev to minimize syscalls.
2586        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2587        for (idx, result) in results.iter().enumerate() {
2588            if result.is_empty() {
2589                continue;
2590            }
2591            if idx > 0 {
2592                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2593                    if prev_last == ch {
2594                        // Skip leading ch bytes in this chunk result
2595                        let skip = result.iter().take_while(|&&b| b == ch).count();
2596                        if skip < result.len() {
2597                            slices.push(std::io::IoSlice::new(&result[skip..]));
2598                        }
2599                        continue;
2600                    }
2601                }
2602            }
2603            slices.push(std::io::IoSlice::new(result));
2604        }
2605        return write_ioslices(writer, &slices);
2606    }
2607
2608    let buf_size = data.len().min(BUF_SIZE);
2609    let mut outbuf = vec![0u8; buf_size];
2610    let len = data.len();
2611    let mut wp = 0;
2612    let mut cursor = 0;
2613
2614    while cursor < len {
2615        match memchr::memchr(ch, &data[cursor..]) {
2616            Some(offset) => {
2617                let pos = cursor + offset;
2618                let gap = pos - cursor;
2619                if gap > 0 {
2620                    if wp + gap > buf_size {
2621                        writer.write_all(&outbuf[..wp])?;
2622                        wp = 0;
2623                    }
2624                    if gap > buf_size {
2625                        writer.write_all(&data[cursor..pos])?;
2626                    } else {
2627                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
2628                        wp += gap;
2629                    }
2630                }
2631                if wp >= buf_size {
2632                    writer.write_all(&outbuf[..wp])?;
2633                    wp = 0;
2634                }
2635                outbuf[wp] = ch;
2636                wp += 1;
2637                cursor = pos + 1;
2638                while cursor < len && data[cursor] == ch {
2639                    cursor += 1;
2640                }
2641            }
2642            None => {
2643                let remaining = len - cursor;
2644                if remaining > 0 {
2645                    if wp + remaining > buf_size {
2646                        writer.write_all(&outbuf[..wp])?;
2647                        wp = 0;
2648                    }
2649                    if remaining > buf_size {
2650                        writer.write_all(&data[cursor..])?;
2651                    } else {
2652                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
2653                        wp += remaining;
2654                    }
2655                }
2656                break;
2657            }
2658        }
2659    }
2660
2661    if wp > 0 {
2662        writer.write_all(&outbuf[..wp])?;
2663    }
2664    Ok(())
2665}
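
// Illustrative sketch (hypothetical test) of the single-byte mmap squeeze: when
// the data contains no (ch, ch) pair the early memmem check writes it through
// unchanged; otherwise runs of ch collapse to one byte. Data is made up.
#[cfg(test)]
mod squeeze_single_mmap_example {
    use super::*;

    #[test]
    fn passthrough_and_collapse() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b"/", b"no repeats here", &mut out).unwrap();
        assert_eq!(&out[..], &b"no repeats here"[..]);

        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b"/", b"usr//local///bin", &mut out).unwrap();
        assert_eq!(&out[..], &b"usr/local/bin"[..]);
    }
}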