coreutils_rs/tr/core.rs

1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5/// Maximum IoSlice entries per write_vectored batch.
6/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
7const MAX_IOV: usize = 1024;
8
9/// Main processing buffer: 4MB (fits in L3 cache, avoids cache thrashing).
10const BUF_SIZE: usize = 4 * 1024 * 1024;
11
12/// Stream buffer: 16MB — tr streaming operations (translate, delete, squeeze)
13/// are compute-light (single table lookup or bitset check per byte), so the
14/// bottleneck is I/O syscalls, not cache pressure. 16MB buffer means only
15/// 1 read()/write() syscall pair for a 10MB input, minimizing syscall overhead.
16/// For piped input, read_full retries partial reads to fill the entire buffer.
17/// This applies to ALL streaming modes (delete, squeeze, translate).
18const STREAM_BUF: usize = 16 * 1024 * 1024;
19
20/// Minimum data size to engage rayon parallel processing for mmap paths.
21/// Below this, single-threaded is faster due to thread pool overhead.
22const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
23
24/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
25/// Falls back to write_all per slice for partial writes.
26#[inline]
27fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
28    if slices.is_empty() {
29        return Ok(());
30    }
31    for batch in slices.chunks(MAX_IOV) {
32        let total: usize = batch.iter().map(|s| s.len()).sum();
33        match writer.write_vectored(batch) {
34            Ok(n) if n >= total => continue,
35            Ok(mut written) => {
36                // Partial write: fall back to write_all per remaining slice
37                for slice in batch {
38                    let slen = slice.len();
39                    if written >= slen {
40                        written -= slen;
41                        continue;
42                    }
43                    if written > 0 {
44                        writer.write_all(&slice[written..])?;
45                        written = 0;
46                    } else {
47                        writer.write_all(slice)?;
48                    }
49                }
50            }
51            Err(e) => return Err(e),
52        }
53    }
54    Ok(())
55}
56
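// Illustrative usage sketch (not part of the original file): a Vec<u8> stands in
// for the writer; with two small slices everything fits in one MAX_IOV batch and
// the output is simply the concatenation.
#[cfg(test)]
mod write_ioslices_example {
    #[test]
    fn concatenates_slices_in_order() {
        let mut out: Vec<u8> = Vec::new();
        let slices = [std::io::IoSlice::new(b"foo"), std::io::IoSlice::new(b"bar")];
        super::write_ioslices(&mut out, &slices).unwrap();
        assert_eq!(out, b"foobar".to_vec());
    }
}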
57/// Allocate a Vec<u8> of given length without zero-initialization.
58/// SAFETY: Caller must write all bytes before reading them.
59#[inline]
60#[allow(clippy::uninit_vec)]
61fn alloc_uninit_vec(len: usize) -> Vec<u8> {
62    let mut v = Vec::with_capacity(len);
63    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
64    unsafe {
65        v.set_len(len);
66    }
67    v
68}
69
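// Illustrative usage sketch (not part of the original file): the caller upholds
// the stated contract by overwriting every byte before reading any of them.
#[cfg(test)]
mod alloc_uninit_vec_example {
    #[test]
    fn overwrite_before_read() {
        let src = [7u8; 64];
        let mut dst = super::alloc_uninit_vec(64);
        dst.copy_from_slice(&src); // writes all 64 bytes before any read
        assert_eq!(dst, src);
    }
}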
70/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
71#[inline]
72fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
73    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
74    let last = set2.last().copied();
75    for (i, &from) in set1.iter().enumerate() {
76        table[from as usize] = if i < set2.len() {
77            set2[i]
78        } else {
79            last.unwrap_or(from)
80        };
81    }
82    table
83}
84
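// Illustrative sketch (not part of the original file): bytes in set1 map to the
// corresponding byte of set2; when set2 is shorter, this implementation repeats
// its last byte (GNU tr behavior); every other byte maps to itself.
#[cfg(test)]
mod build_translate_table_example {
    #[test]
    fn maps_set1_onto_set2_and_pads_with_last_byte() {
        let table = super::build_translate_table(b"abc", b"xy");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'y');
        assert_eq!(table[b'c' as usize], b'y'); // set2 exhausted: last byte repeats
        assert_eq!(table[b'z' as usize], b'z'); // untouched bytes are identity
    }
}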
85/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
86#[inline]
87fn build_member_set(chars: &[u8]) -> [u8; 32] {
88    let mut set = [0u8; 32];
89    for &ch in chars {
90        set[ch as usize >> 3] |= 1 << (ch & 7);
91    }
92    set
93}
94
95#[inline(always)]
96fn is_member(set: &[u8; 32], ch: u8) -> bool {
97    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
98}
99
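// Illustrative sketch (not part of the original file): one bit per possible byte
// value, so membership is a shift, a mask, and a single byte load.
#[cfg(test)]
mod member_set_example {
    #[test]
    fn bitset_membership() {
        let set = super::build_member_set(b"aeiou");
        assert!(super::is_member(&set, b'a'));
        assert!(super::is_member(&set, b'u'));
        assert!(!super::is_member(&set, b'b'));
        assert!(!super::is_member(&set, 0xFF));
    }
}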
100/// Translate bytes in-place using a 256-byte lookup table.
101/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition for
102/// ~32 bytes per iteration. Falls back to 8x-unrolled scalar on other platforms.
103#[inline(always)]
104fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
105    #[cfg(target_arch = "x86_64")]
106    {
107        if is_x86_feature_detected!("avx2") {
108            unsafe { translate_inplace_avx2_table(data, table) };
109            return;
110        }
111        if is_x86_feature_detected!("ssse3") {
112            unsafe { translate_inplace_ssse3_table(data, table) };
113            return;
114        }
115    }
116    translate_inplace_scalar(data, table);
117}
118
119/// Scalar fallback: 8x-unrolled table lookup.
120#[inline(always)]
121fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
122    let len = data.len();
123    let ptr = data.as_mut_ptr();
124    let mut i = 0;
125    unsafe {
126        while i + 8 <= len {
127            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
128            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
129            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
130            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
131            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
132            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
133            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
134            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
135            i += 8;
136        }
137        while i < len {
138            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
139            i += 1;
140        }
141    }
142}
143
144// ============================================================================
145// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
146// ============================================================================
147//
148// For an arbitrary 256-byte lookup table, we decompose each byte into
149// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
150// SIMD vectors, one for each high nibble value h (0..15), containing
151// the 16 table entries table[h*16+0..h*16+15]. Then for each input
152// vector we:
153//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
154//   2. Extract high nibble (shift right 4) -> used to select which table
155//   3. For each of the 16 high nibble values, create a mask where
156//      the high nibble equals that value, pshufb the corresponding
157//      table, and accumulate results
158//
159// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
160// With instruction-level parallelism, this achieves much higher throughput
161// than scalar table lookups which have serial data dependencies.
162
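// Illustrative sketch (not part of the original file): the same decomposition in
// scalar form, to show the indexing the SIMD kernels below perform per byte:
// table[b] == rows[b >> 4][b & 0x0F], where rows[h] holds table[h*16 .. h*16+16].
#[cfg(test)]
mod nibble_decomposition_example {
    #[test]
    fn table_lookup_equals_nibble_lookup() {
        // Arbitrary non-identity table: rotate every byte value by 3.
        let table: [u8; 256] = std::array::from_fn(|i| (i as u8).wrapping_add(3));
        // The 16 "rows" the SIMD code loads into its pshufb lookup vectors.
        let rows: Vec<[u8; 16]> = (0usize..16)
            .map(|h| std::array::from_fn(|l| table[h * 16 + l]))
            .collect();
        for b in 0u8..=255 {
            let hi = (b >> 4) as usize; // selects which row (which LUT vector)
            let lo = (b & 0x0F) as usize; // pshufb index within that row
            assert_eq!(table[b as usize], rows[hi][lo]);
        }
    }
}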
163#[cfg(target_arch = "x86_64")]
164#[target_feature(enable = "avx2")]
165unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
166    use std::arch::x86_64::*;
167
168    unsafe {
169        let len = data.len();
170        let ptr = data.as_mut_ptr();
171
172        // Pre-build 16 lookup vectors, one per high nibble value.
173        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
174        // 16 table entries for pshufb indexing by low nibble.
175        let mut lut = [_mm256_setzero_si256(); 16];
176        for h in 0u8..16 {
177            let base = (h as usize) * 16;
178            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
179            // Broadcast the 128-bit row to both lanes of the 256-bit vector
180            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
181            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
182        }
183
184        let lo_mask = _mm256_set1_epi8(0x0F);
185
186        let mut i = 0;
187        while i + 32 <= len {
188            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
189            let lo_nibble = _mm256_and_si256(input, lo_mask);
190            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
191
192            // Accumulate result: for each high nibble value h, select bytes where
193            // hi_nibble == h, look up in lut[h] using lo_nibble, blend into result.
194            let mut result = _mm256_setzero_si256();
195
196            // Unroll the 16 high-nibble iterations for best ILP
197            macro_rules! do_nibble {
198                ($h:expr) => {
199                    let h_val = _mm256_set1_epi8($h as i8);
200                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
201                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
202                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
203                };
204            }
205            do_nibble!(0);
206            do_nibble!(1);
207            do_nibble!(2);
208            do_nibble!(3);
209            do_nibble!(4);
210            do_nibble!(5);
211            do_nibble!(6);
212            do_nibble!(7);
213            do_nibble!(8);
214            do_nibble!(9);
215            do_nibble!(10);
216            do_nibble!(11);
217            do_nibble!(12);
218            do_nibble!(13);
219            do_nibble!(14);
220            do_nibble!(15);
221
222            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
223            i += 32;
224        }
225
226        // SSE/SSSE3 tail for remaining 16-byte chunk
227        if i + 16 <= len {
228            let lo_mask128 = _mm_set1_epi8(0x0F);
229
230            // Build 128-bit LUTs (just the lower lane of each 256-bit LUT)
231            let mut lut128 = [_mm_setzero_si128(); 16];
232            for h in 0u8..16 {
233                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
234            }
235
236            let input = _mm_loadu_si128(ptr.add(i) as *const _);
237            let lo_nib = _mm_and_si128(input, lo_mask128);
238            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
239
240            let mut res = _mm_setzero_si128();
241            macro_rules! do_nibble128 {
242                ($h:expr) => {
243                    let h_val = _mm_set1_epi8($h as i8);
244                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
245                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
246                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
247                };
248            }
249            do_nibble128!(0);
250            do_nibble128!(1);
251            do_nibble128!(2);
252            do_nibble128!(3);
253            do_nibble128!(4);
254            do_nibble128!(5);
255            do_nibble128!(6);
256            do_nibble128!(7);
257            do_nibble128!(8);
258            do_nibble128!(9);
259            do_nibble128!(10);
260            do_nibble128!(11);
261            do_nibble128!(12);
262            do_nibble128!(13);
263            do_nibble128!(14);
264            do_nibble128!(15);
265
266            _mm_storeu_si128(ptr.add(i) as *mut _, res);
267            i += 16;
268        }
269
270        // Scalar tail
271        while i < len {
272            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
273            i += 1;
274        }
275    }
276}
277
278#[cfg(target_arch = "x86_64")]
279#[target_feature(enable = "ssse3")]
280unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
281    use std::arch::x86_64::*;
282
283    unsafe {
284        let len = data.len();
285        let ptr = data.as_mut_ptr();
286
287        // Pre-build 16 lookup vectors for pshufb
288        let mut lut = [_mm_setzero_si128(); 16];
289        for h in 0u8..16 {
290            let base = (h as usize) * 16;
291            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
292            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
293        }
294
295        let lo_mask = _mm_set1_epi8(0x0F);
296
297        let mut i = 0;
298        while i + 16 <= len {
299            let input = _mm_loadu_si128(ptr.add(i) as *const _);
300            let lo_nibble = _mm_and_si128(input, lo_mask);
301            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
302
303            let mut result = _mm_setzero_si128();
304
305            macro_rules! do_nibble {
306                ($h:expr) => {
307                    let h_val = _mm_set1_epi8($h as i8);
308                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
309                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
310                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
311                };
312            }
313            do_nibble!(0);
314            do_nibble!(1);
315            do_nibble!(2);
316            do_nibble!(3);
317            do_nibble!(4);
318            do_nibble!(5);
319            do_nibble!(6);
320            do_nibble!(7);
321            do_nibble!(8);
322            do_nibble!(9);
323            do_nibble!(10);
324            do_nibble!(11);
325            do_nibble!(12);
326            do_nibble!(13);
327            do_nibble!(14);
328            do_nibble!(15);
329
330            _mm_storeu_si128(ptr.add(i) as *mut _, result);
331            i += 16;
332        }
333
334        // Scalar tail
335        while i < len {
336            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
337            i += 1;
338        }
339    }
340}
341
342/// Translate bytes from source to destination using a 256-byte lookup table.
343/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
344#[inline(always)]
345fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
346    debug_assert!(dst.len() >= src.len());
347    #[cfg(target_arch = "x86_64")]
348    {
349        if is_x86_feature_detected!("avx2") {
350            unsafe { translate_to_avx2_table(src, dst, table) };
351            return;
352        }
353        if is_x86_feature_detected!("ssse3") {
354            unsafe { translate_to_ssse3_table(src, dst, table) };
355            return;
356        }
357    }
358    translate_to_scalar(src, dst, table);
359}
360
361/// Scalar fallback for translate_to.
362#[inline(always)]
363fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
364    unsafe {
365        let sp = src.as_ptr();
366        let dp = dst.as_mut_ptr();
367        let len = src.len();
368        let mut i = 0;
369        while i + 8 <= len {
370            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
371            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
372            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
373            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
374            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
375            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
376            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
377            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
378            i += 8;
379        }
380        while i < len {
381            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
382            i += 1;
383        }
384    }
385}
386
387#[cfg(target_arch = "x86_64")]
388#[target_feature(enable = "avx2")]
389unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
390    use std::arch::x86_64::*;
391
392    unsafe {
393        let len = src.len();
394        let sp = src.as_ptr();
395        let dp = dst.as_mut_ptr();
396
397        // Pre-build 16 lookup vectors
398        let mut lut = [_mm256_setzero_si256(); 16];
399        for h in 0u8..16 {
400            let base = (h as usize) * 16;
401            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
402            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
403            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
404        }
405
406        let lo_mask = _mm256_set1_epi8(0x0F);
407
408        let mut i = 0;
409        while i + 32 <= len {
410            let input = _mm256_loadu_si256(sp.add(i) as *const _);
411            let lo_nibble = _mm256_and_si256(input, lo_mask);
412            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
413
414            let mut result = _mm256_setzero_si256();
415
416            macro_rules! do_nibble {
417                ($h:expr) => {
418                    let h_val = _mm256_set1_epi8($h as i8);
419                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
420                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
421                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
422                };
423            }
424            do_nibble!(0);
425            do_nibble!(1);
426            do_nibble!(2);
427            do_nibble!(3);
428            do_nibble!(4);
429            do_nibble!(5);
430            do_nibble!(6);
431            do_nibble!(7);
432            do_nibble!(8);
433            do_nibble!(9);
434            do_nibble!(10);
435            do_nibble!(11);
436            do_nibble!(12);
437            do_nibble!(13);
438            do_nibble!(14);
439            do_nibble!(15);
440
441            _mm256_storeu_si256(dp.add(i) as *mut _, result);
442            i += 32;
443        }
444
445        // SSSE3 tail for remaining 16-byte chunk
446        if i + 16 <= len {
447            let lo_mask128 = _mm_set1_epi8(0x0F);
448            let mut lut128 = [_mm_setzero_si128(); 16];
449            for h in 0u8..16 {
450                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
451            }
452
453            let input = _mm_loadu_si128(sp.add(i) as *const _);
454            let lo_nib = _mm_and_si128(input, lo_mask128);
455            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
456
457            let mut res = _mm_setzero_si128();
458            macro_rules! do_nibble128 {
459                ($h:expr) => {
460                    let h_val = _mm_set1_epi8($h as i8);
461                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
462                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
463                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
464                };
465            }
466            do_nibble128!(0);
467            do_nibble128!(1);
468            do_nibble128!(2);
469            do_nibble128!(3);
470            do_nibble128!(4);
471            do_nibble128!(5);
472            do_nibble128!(6);
473            do_nibble128!(7);
474            do_nibble128!(8);
475            do_nibble128!(9);
476            do_nibble128!(10);
477            do_nibble128!(11);
478            do_nibble128!(12);
479            do_nibble128!(13);
480            do_nibble128!(14);
481            do_nibble128!(15);
482
483            _mm_storeu_si128(dp.add(i) as *mut _, res);
484            i += 16;
485        }
486
487        // Scalar tail
488        while i < len {
489            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
490            i += 1;
491        }
492    }
493}
494
495#[cfg(target_arch = "x86_64")]
496#[target_feature(enable = "ssse3")]
497unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
498    use std::arch::x86_64::*;
499
500    unsafe {
501        let len = src.len();
502        let sp = src.as_ptr();
503        let dp = dst.as_mut_ptr();
504
505        let mut lut = [_mm_setzero_si128(); 16];
506        for h in 0u8..16 {
507            let base = (h as usize) * 16;
508            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
509            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
510        }
511
512        let lo_mask = _mm_set1_epi8(0x0F);
513
514        let mut i = 0;
515        while i + 16 <= len {
516            let input = _mm_loadu_si128(sp.add(i) as *const _);
517            let lo_nibble = _mm_and_si128(input, lo_mask);
518            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
519
520            let mut result = _mm_setzero_si128();
521
522            macro_rules! do_nibble {
523                ($h:expr) => {
524                    let h_val = _mm_set1_epi8($h as i8);
525                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
526                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
527                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
528                };
529            }
530            do_nibble!(0);
531            do_nibble!(1);
532            do_nibble!(2);
533            do_nibble!(3);
534            do_nibble!(4);
535            do_nibble!(5);
536            do_nibble!(6);
537            do_nibble!(7);
538            do_nibble!(8);
539            do_nibble!(9);
540            do_nibble!(10);
541            do_nibble!(11);
542            do_nibble!(12);
543            do_nibble!(13);
544            do_nibble!(14);
545            do_nibble!(15);
546
547            _mm_storeu_si128(dp.add(i) as *mut _, result);
548            i += 16;
549        }
550
551        // Scalar tail
552        while i < len {
553            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
554            i += 1;
555        }
556    }
557}
558
559// ============================================================================
560// SIMD range translation (x86_64)
561// ============================================================================
562
563/// Detect if the translate table is a single contiguous range with constant offset.
564/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
565/// table[i] = i + offset for all i in [lo, hi].
566#[inline]
567fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
568    let mut lo: Option<u8> = None;
569    let mut hi = 0u8;
570    let mut offset = 0i16;
571
572    for i in 0..256 {
573        if table[i] != i as u8 {
574            let diff = table[i] as i16 - i as i16;
575            match lo {
576                None => {
577                    lo = Some(i as u8);
578                    hi = i as u8;
579                    offset = diff;
580                }
581                Some(_) => {
582                    if diff != offset || i as u8 != hi.wrapping_add(1) {
583                        return None;
584                    }
585                    hi = i as u8;
586                }
587            }
588        }
589    }
590
591    lo.map(|l| (l, hi, offset as i8))
592}
593
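// Illustrative sketch (not part of the original file): the classic a-z -> A-Z
// table is one contiguous range shifted by a constant (-32) and qualifies for
// the SIMD range path; a table with mixed offsets does not.
#[cfg(test)]
mod detect_range_offset_example {
    #[test]
    fn lowercase_to_uppercase_is_a_constant_offset_range() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let table = super::build_translate_table(&set1, &set2);
        assert_eq!(super::detect_range_offset(&table), Some((b'a', b'z', -32)));

        // 'a' <-> 'b' swap: offsets +1 and -1, so no single-range fast path.
        let swapped = super::build_translate_table(b"ab", b"ba");
        assert_eq!(super::detect_range_offset(&swapped), None);
    }
}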
594/// SIMD-accelerated range translation for mmap'd data.
595/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
596/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
597#[cfg(target_arch = "x86_64")]
598fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
599    if is_x86_feature_detected!("avx2") {
600        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
601    } else {
602        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
603    }
604}
605
606#[cfg(target_arch = "x86_64")]
607#[target_feature(enable = "avx2")]
608unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
609    use std::arch::x86_64::*;
610
611    unsafe {
612        let range = hi - lo;
613        // Bias: shift range so lo maps to -128 (signed min).
614        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
615        // For input < lo: biased wraps to large positive (signed), > threshold.
616        // For input > hi: biased > -128+range, > threshold.
617        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
618        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
619        let offset_v = _mm256_set1_epi8(offset);
620        let zero = _mm256_setzero_si256();
621
622        let len = src.len();
623        let mut i = 0;
624
625        while i + 32 <= len {
626            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
627            let biased = _mm256_add_epi8(input, bias_v);
628            // gt = 0xFF where biased > threshold (OUT of range)
629            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
630            // mask = 0xFF where IN range (NOT gt)
631            let mask = _mm256_cmpeq_epi8(gt, zero);
632            let offset_masked = _mm256_and_si256(mask, offset_v);
633            let result = _mm256_add_epi8(input, offset_masked);
634            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
635            i += 32;
636        }
637
638        // SSE2 tail for 16-byte remainder
639        if i + 16 <= len {
640            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
641            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
642            let offset_v128 = _mm_set1_epi8(offset);
643            let zero128 = _mm_setzero_si128();
644
645            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
646            let biased = _mm_add_epi8(input, bias_v128);
647            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
648            let mask = _mm_cmpeq_epi8(gt, zero128);
649            let offset_masked = _mm_and_si128(mask, offset_v128);
650            let result = _mm_add_epi8(input, offset_masked);
651            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
652            i += 16;
653        }
654
655        // Scalar tail
656        while i < len {
657            let b = *src.get_unchecked(i);
658            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
659                b.wrapping_add(offset as u8)
660            } else {
661                b
662            };
663            i += 1;
664        }
665    }
666}
667
668#[cfg(target_arch = "x86_64")]
669#[target_feature(enable = "sse2")]
670unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
671    use std::arch::x86_64::*;
672
673    unsafe {
674        let range = hi - lo;
675        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
676        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
677        let offset_v = _mm_set1_epi8(offset);
678        let zero = _mm_setzero_si128();
679
680        let len = src.len();
681        let mut i = 0;
682
683        while i + 16 <= len {
684            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
685            let biased = _mm_add_epi8(input, bias_v);
686            let gt = _mm_cmpgt_epi8(biased, threshold_v);
687            let mask = _mm_cmpeq_epi8(gt, zero);
688            let offset_masked = _mm_and_si128(mask, offset_v);
689            let result = _mm_add_epi8(input, offset_masked);
690            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
691            i += 16;
692        }
693
694        while i < len {
695            let b = *src.get_unchecked(i);
696            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
697                b.wrapping_add(offset as u8)
698            } else {
699                b
700            };
701            i += 1;
702        }
703    }
704}
705
706/// Scalar range translation fallback for non-x86_64.
707#[cfg(not(target_arch = "x86_64"))]
708fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
709    for (i, &b) in src.iter().enumerate() {
710        dst[i] = if b >= lo && b <= hi {
711            b.wrapping_add(offset as u8)
712        } else {
713            b
714        };
715    }
716}
717
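// Illustrative sketch (not part of the original file): the bias+threshold trick
// used by the SIMD range kernels above, checked in scalar form. Adding
// (0x80 - lo) maps lo to i8::MIN, so "b in [lo, hi]" becomes one signed
// comparison, which is what _mm*_cmpgt_epi8 evaluates per lane.
#[cfg(test)]
mod bias_threshold_example {
    #[test]
    fn signed_compare_matches_plain_range_check() {
        let (lo, hi) = (b'0', b'9');
        let threshold = 0x80u8.wrapping_add(hi - lo) as i8;
        for b in 0u8..=255 {
            let biased = b.wrapping_add(0x80u8.wrapping_sub(lo)) as i8;
            let in_range_biased = biased <= threshold; // i.e. NOT(biased > threshold)
            assert_eq!(in_range_biased, (lo..=hi).contains(&b));
        }
    }
}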
718// ============================================================================
719// In-place SIMD range translation (saves one buffer allocation in streaming)
720// ============================================================================
721
722/// In-place SIMD-accelerated range translation.
723/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
724/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
725#[cfg(target_arch = "x86_64")]
726fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
727    if is_x86_feature_detected!("avx2") {
728        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
729    } else {
730        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
731    }
732}
733
734#[cfg(target_arch = "x86_64")]
735#[target_feature(enable = "avx2")]
736unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
737    use std::arch::x86_64::*;
738
739    unsafe {
740        let range = hi - lo;
741        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
742        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
743        let offset_v = _mm256_set1_epi8(offset);
744        let zero = _mm256_setzero_si256();
745
746        let len = data.len();
747        let ptr = data.as_mut_ptr();
748        let mut i = 0;
749
750        while i + 32 <= len {
751            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
752            let biased = _mm256_add_epi8(input, bias_v);
753            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
754            let mask = _mm256_cmpeq_epi8(gt, zero);
755            let offset_masked = _mm256_and_si256(mask, offset_v);
756            let result = _mm256_add_epi8(input, offset_masked);
757            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
758            i += 32;
759        }
760
761        if i + 16 <= len {
762            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
763            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
764            let offset_v128 = _mm_set1_epi8(offset);
765            let zero128 = _mm_setzero_si128();
766
767            let input = _mm_loadu_si128(ptr.add(i) as *const _);
768            let biased = _mm_add_epi8(input, bias_v128);
769            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
770            let mask = _mm_cmpeq_epi8(gt, zero128);
771            let offset_masked = _mm_and_si128(mask, offset_v128);
772            let result = _mm_add_epi8(input, offset_masked);
773            _mm_storeu_si128(ptr.add(i) as *mut _, result);
774            i += 16;
775        }
776
777        while i < len {
778            let b = *ptr.add(i);
779            *ptr.add(i) = if b >= lo && b <= hi {
780                b.wrapping_add(offset as u8)
781            } else {
782                b
783            };
784            i += 1;
785        }
786    }
787}
788
789#[cfg(target_arch = "x86_64")]
790#[target_feature(enable = "sse2")]
791unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
792    use std::arch::x86_64::*;
793
794    unsafe {
795        let range = hi - lo;
796        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
797        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
798        let offset_v = _mm_set1_epi8(offset);
799        let zero = _mm_setzero_si128();
800
801        let len = data.len();
802        let ptr = data.as_mut_ptr();
803        let mut i = 0;
804
805        while i + 16 <= len {
806            let input = _mm_loadu_si128(ptr.add(i) as *const _);
807            let biased = _mm_add_epi8(input, bias_v);
808            let gt = _mm_cmpgt_epi8(biased, threshold_v);
809            let mask = _mm_cmpeq_epi8(gt, zero);
810            let offset_masked = _mm_and_si128(mask, offset_v);
811            let result = _mm_add_epi8(input, offset_masked);
812            _mm_storeu_si128(ptr.add(i) as *mut _, result);
813            i += 16;
814        }
815
816        while i < len {
817            let b = *ptr.add(i);
818            *ptr.add(i) = if b >= lo && b <= hi {
819                b.wrapping_add(offset as u8)
820            } else {
821                b
822            };
823            i += 1;
824        }
825    }
826}
827
828#[cfg(not(target_arch = "x86_64"))]
829fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
830    for b in data.iter_mut() {
831        if *b >= lo && *b <= hi {
832            *b = b.wrapping_add(offset as u8);
833        }
834    }
835}
836
837// ============================================================================
838// SIMD range deletion (x86_64)
839// ============================================================================
840
841/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
842/// Returns Some((lo, hi)) if so. This is true for common classes:
843/// - `[:digit:]` = 0x30..=0x39
844/// - `a-z` = 0x61..=0x7A
845/// - `A-Z` = 0x41..=0x5A
846#[inline]
847fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
848    if chars.is_empty() {
849        return None;
850    }
851    let mut lo = chars[0];
852    let mut hi = chars[0];
853    for &c in &chars[1..] {
854        if c < lo {
855            lo = c;
856        }
857        if c > hi {
858            hi = c;
859        }
860    }
861    // Check that the range size matches the number of chars (no gaps)
862    if (hi - lo + 1) as usize == chars.len() {
863        Some((lo, hi))
864    } else {
865        None
866    }
867}
868
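// Illustrative sketch (not part of the original file): digits form a contiguous
// range and get the SIMD delete path; a gapped set falls back to the bitset loop.
#[cfg(test)]
mod detect_delete_range_example {
    #[test]
    fn contiguous_vs_gapped_sets() {
        assert_eq!(super::detect_delete_range(b"0123456789"), Some((0x30, 0x39)));
        assert_eq!(super::detect_delete_range(b"acegik"), None); // gaps in the set
    }
}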
869/// SIMD-accelerated delete for contiguous byte ranges.
870/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
871/// then compacts output by skipping matched bytes.
872#[cfg(target_arch = "x86_64")]
873fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
874    if is_x86_feature_detected!("avx2") {
875        unsafe { delete_range_avx2(src, dst, lo, hi) }
876    } else {
877        unsafe { delete_range_sse2(src, dst, lo, hi) }
878    }
879}
880
881#[cfg(target_arch = "x86_64")]
882#[target_feature(enable = "avx2")]
883unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
884    use std::arch::x86_64::*;
885
886    unsafe {
887        let range = hi - lo;
888        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
889        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
890        let zero = _mm256_setzero_si256();
891
892        let len = src.len();
893        let sp = src.as_ptr();
894        let dp = dst.as_mut_ptr();
895        let mut ri = 0;
896        let mut wp = 0;
897
898        while ri + 32 <= len {
899            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
900            let biased = _mm256_add_epi8(input, bias_v);
901            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
902            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
903            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
904            let in_range = _mm256_cmpeq_epi8(gt, zero);
905            // keep_mask bits: 1 = keep (NOT in range)
906            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
907
908            if keep_mask == 0xFFFFFFFF {
909                // All 32 bytes are kept — bulk copy
910                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
911                wp += 32;
912            } else if keep_mask != 0 {
913                // Partial keep — process each 8-byte lane with popcnt
914                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
915                let c0 = (keep_mask as u8).count_ones() as usize;
916                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
917                let c1 = ((keep_mask >> 8) as u8).count_ones() as usize;
918                compact_8bytes(
919                    sp.add(ri + 16),
920                    dp.add(wp + c0 + c1),
921                    (keep_mask >> 16) as u8,
922                );
923                let c2 = ((keep_mask >> 16) as u8).count_ones() as usize;
924                compact_8bytes(
925                    sp.add(ri + 24),
926                    dp.add(wp + c0 + c1 + c2),
927                    (keep_mask >> 24) as u8,
928                );
929                let c3 = ((keep_mask >> 24) as u8).count_ones() as usize;
930                wp += c0 + c1 + c2 + c3;
931            }
932            // else: keep_mask == 0 means all bytes deleted, skip entirely
933            ri += 32;
934        }
935
936        // SSE2 tail for 16-byte remainder
937        if ri + 16 <= len {
938            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
939            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
940            let zero128 = _mm_setzero_si128();
941
942            let input = _mm_loadu_si128(sp.add(ri) as *const _);
943            let biased = _mm_add_epi8(input, bias_v128);
944            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
945            let in_range = _mm_cmpeq_epi8(gt, zero128);
946            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
947
948            if keep_mask == 0xFFFF {
949                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
950                wp += 16;
951            } else if keep_mask != 0 {
952                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
953                let c0 = (keep_mask as u8).count_ones() as usize;
954                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
955                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
956            }
957            ri += 16;
958        }
959
960        // Scalar tail
961        while ri < len {
962            let b = *sp.add(ri);
963            if b < lo || b > hi {
964                *dp.add(wp) = b;
965                wp += 1;
966            }
967            ri += 1;
968        }
969
970        wp
971    }
972}
973
974/// Compact 8 source bytes into contiguous output bytes using a keep mask.
975/// Bit i of `mask` indicates whether source byte i should be kept.
976/// Writes only the kept bytes (one per set bit); the caller advances its
977/// write pointer by `mask.count_ones()`.
978#[cfg(target_arch = "x86_64")]
979#[inline(always)]
980unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
981    // For each set bit position in the mask, copy that byte from src to the
982    // next output position. Using a tight trailing_zeros loop is faster than
983    // a lookup table for 8-bit masks because it's branch-predicted well
984    // and avoids cache misses on the table.
985    unsafe {
986        let mut m = mask;
987        let mut w = 0;
988        while m != 0 {
989            let bit = m.trailing_zeros() as usize;
990            *dst.add(w) = *src.add(bit);
991            w += 1;
992            m &= m - 1;
993        }
994    }
995}
996
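// Illustrative sketch (not part of the original file): bit i of the mask keeps
// src[i]; kept bytes are packed to the front of dst and the caller advances its
// write pointer by mask.count_ones().
#[cfg(all(test, target_arch = "x86_64"))]
mod compact_8bytes_example {
    #[test]
    fn packs_kept_bytes_to_the_front() {
        let src = *b"ABCDEFGH";
        let mut dst = [0u8; 8];
        let mask: u8 = 0b1010_1010; // keep the odd positions: B, D, F, H
        unsafe { super::compact_8bytes(src.as_ptr(), dst.as_mut_ptr(), mask) };
        assert_eq!(&dst[..mask.count_ones() as usize], b"BDFH");
    }
}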
997#[cfg(target_arch = "x86_64")]
998#[target_feature(enable = "sse2")]
999unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
1000    use std::arch::x86_64::*;
1001
1002    unsafe {
1003        let range = hi - lo;
1004        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1005        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1006        let zero = _mm_setzero_si128();
1007
1008        let len = src.len();
1009        let sp = src.as_ptr();
1010        let dp = dst.as_mut_ptr();
1011        let mut ri = 0;
1012        let mut wp = 0;
1013
1014        while ri + 16 <= len {
1015            let input = _mm_loadu_si128(sp.add(ri) as *const _);
1016            let biased = _mm_add_epi8(input, bias_v);
1017            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1018            let in_range = _mm_cmpeq_epi8(gt, zero);
1019            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
1020
1021            if keep_mask == 0xFFFF {
1022                // All 16 bytes kept — bulk copy
1023                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
1024                wp += 16;
1025            } else if keep_mask != 0 {
1026                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
1027                let c0 = (keep_mask as u8).count_ones() as usize;
1028                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
1029                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
1030            }
1031            ri += 16;
1032        }
1033
1034        while ri < len {
1035            let b = *sp.add(ri);
1036            if b < lo || b > hi {
1037                *dp.add(wp) = b;
1038                wp += 1;
1039            }
1040            ri += 1;
1041        }
1042
1043        wp
1044    }
1045}
1046
1047/// Scalar range delete fallback for non-x86_64.
1048#[cfg(not(target_arch = "x86_64"))]
1049fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
1050    let mut wp = 0;
1051    for &b in src {
1052        if b < lo || b > hi {
1053            dst[wp] = b;
1054            wp += 1;
1055        }
1056    }
1057    wp
1058}
1059
1060/// Streaming delete for contiguous byte ranges using SIMD range detection.
1061/// Uses the 16MB STREAM_BUF to reduce syscalls (delete is compute-light, I/O bound).
1062/// When no bytes are deleted from a chunk (common for data with few matches),
1063/// writes directly from the source buffer to avoid the copy overhead.
1064fn delete_range_streaming(
1065    lo: u8,
1066    hi: u8,
1067    reader: &mut impl Read,
1068    writer: &mut impl Write,
1069) -> io::Result<()> {
1070    let mut src = vec![0u8; STREAM_BUF];
1071    let mut dst = alloc_uninit_vec(STREAM_BUF);
1072    loop {
1073        let n = read_full(reader, &mut src)?;
1074        if n == 0 {
1075            break;
1076        }
1077        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
1078        if wp == n {
1079            // No bytes deleted — write source directly (avoids copy overhead)
1080            writer.write_all(&src[..n])?;
1081        } else if wp > 0 {
1082            writer.write_all(&dst[..wp])?;
1083        }
1084    }
1085    Ok(())
1086}
1087
1088// ============================================================================
1089// Streaming functions (Read + Write)
1090// ============================================================================
1091
1092pub fn translate(
1093    set1: &[u8],
1094    set2: &[u8],
1095    reader: &mut impl Read,
1096    writer: &mut impl Write,
1097) -> io::Result<()> {
1098    let table = build_translate_table(set1, set2);
1099
1100    // Check for identity table — pure passthrough (no transformation needed)
1101    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1102    if is_identity {
1103        return passthrough_stream(reader, writer);
1104    }
1105
1106    // Try SIMD fast path for range translations (in-place, single buffer)
1107    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1108        return translate_range_stream(lo, hi, offset, reader, writer);
1109    }
1110
1111    // General case: in-place translation on a single STREAM_BUF-sized buffer.
1112    // Compared with the previous separate src/dst approach, each chunk touches one
1113    // buffer instead of two, halving the working set; the translate step's stores
1114    // stay in the same cache lines that the following write_all then reads.
1115    // The 8x-unrolled in-place translate avoids store-to-load forwarding stalls
1116    // because consecutive reads are 8 bytes apart (sequential), not aliased.
1117    // With a 16MB buffer, a 10MB input needs only a single read()/write() pair.
1118    let mut buf = vec![0u8; STREAM_BUF];
1119    loop {
1120        let n = read_full(reader, &mut buf)?;
1121        if n == 0 {
1122            break;
1123        }
1124        translate_inplace(&mut buf[..n], &table);
1125        writer.write_all(&buf[..n])?;
1126    }
1127    Ok(())
1128}
1129
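// Illustrative usage sketch (not part of the original file): an in-memory reader
// and writer stand in for stdin/stdout. "abc" -> "xyz" is a contiguous range with
// a constant offset, so this goes through the SIMD range-stream path.
#[cfg(test)]
mod translate_example {
    #[test]
    fn translates_bytes_through_the_stream_api() {
        let mut input = std::io::Cursor::new(&b"aabbcc"[..]);
        let mut output: Vec<u8> = Vec::new();
        super::translate(b"abc", b"xyz", &mut input, &mut output).unwrap();
        assert_eq!(output, b"xxyyzz".to_vec());
    }
}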
1130/// Streaming SIMD range translation — single buffer, in-place transform.
1131/// Uses the 16MB STREAM_BUF for fewer syscalls (translate is compute-light).
1132fn translate_range_stream(
1133    lo: u8,
1134    hi: u8,
1135    offset: i8,
1136    reader: &mut impl Read,
1137    writer: &mut impl Write,
1138) -> io::Result<()> {
1139    let mut buf = vec![0u8; STREAM_BUF];
1140    loop {
1141        let n = read_full(reader, &mut buf)?;
1142        if n == 0 {
1143            break;
1144        }
1145        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
1146        writer.write_all(&buf[..n])?;
1147    }
1148    Ok(())
1149}
1150
1151/// Pure passthrough: copy stdin to stdout without transformation.
1152/// Uses a single 16MB STREAM_BUF with direct read/write, no processing overhead.
1153fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1154    let mut buf = vec![0u8; STREAM_BUF];
1155    loop {
1156        let n = read_full(reader, &mut buf)?;
1157        if n == 0 {
1158            break;
1159        }
1160        writer.write_all(&buf[..n])?;
1161    }
1162    Ok(())
1163}
1164
1165/// Read as many bytes as possible into buf, retrying on partial reads.
1166/// Fast path: first read() often fills the entire buffer for regular files.
1167#[inline]
1168fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
1169    // Fast path: first read() usually fills the entire buffer for regular files
1170    let n = reader.read(buf)?;
1171    if n == buf.len() || n == 0 {
1172        return Ok(n);
1173    }
1174    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
1175    let mut total = n;
1176    while total < buf.len() {
1177        match reader.read(&mut buf[total..]) {
1178            Ok(0) => break,
1179            Ok(n) => total += n,
1180            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
1181            Err(e) => return Err(e),
1182        }
1183    }
1184    Ok(total)
1185}
1186
1187pub fn translate_squeeze(
1188    set1: &[u8],
1189    set2: &[u8],
1190    reader: &mut impl Read,
1191    writer: &mut impl Write,
1192) -> io::Result<()> {
1193    let table = build_translate_table(set1, set2);
1194    let squeeze_set = build_member_set(set2);
1195
1196    // Two-pass optimization for range translations:
1197    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
1198    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
1199    // Even though it's two passes, the translate pass is so much faster with SIMD
1200    // that the total is still a net win.
1201    let range_info = detect_range_offset(&table);
1202
1203    let mut buf = vec![0u8; STREAM_BUF];
1204    let mut last_squeezed: u16 = 256;
1205
1206    loop {
1207        let n = read_full(reader, &mut buf)?;
1208        if n == 0 {
1209            break;
1210        }
1211        // Pass 1: translate
1212        if let Some((lo, hi, offset)) = range_info {
1213            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
1214        } else {
1215            translate_inplace(&mut buf[..n], &table);
1216        }
1217        // Pass 2: squeeze in-place
1218        let mut wp = 0;
1219        unsafe {
1220            let ptr = buf.as_mut_ptr();
1221            for i in 0..n {
1222                let b = *ptr.add(i);
1223                if is_member(&squeeze_set, b) {
1224                    if last_squeezed == b as u16 {
1225                        continue;
1226                    }
1227                    last_squeezed = b as u16;
1228                } else {
1229                    last_squeezed = 256;
1230                }
1231                *ptr.add(wp) = b;
1232                wp += 1;
1233            }
1234        }
1235        writer.write_all(&buf[..wp])?;
1236    }
1237    Ok(())
1238}
1239
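// Illustrative usage sketch (not part of the original file): everything in set1
// becomes 'x', and the run of translated 'x' bytes is squeezed down to one, as in
// `tr -s abc x`.
#[cfg(test)]
mod translate_squeeze_example {
    #[test]
    fn translate_then_squeeze() {
        let mut input = std::io::Cursor::new(&b"abcabc"[..]);
        let mut output: Vec<u8> = Vec::new();
        super::translate_squeeze(b"abc", b"x", &mut input, &mut output).unwrap();
        assert_eq!(output, b"x".to_vec());
    }
}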
1240pub fn delete(
1241    delete_chars: &[u8],
1242    reader: &mut impl Read,
1243    writer: &mut impl Write,
1244) -> io::Result<()> {
1245    if delete_chars.len() == 1 {
1246        return delete_single_streaming(delete_chars[0], reader, writer);
1247    }
1248    if delete_chars.len() <= 3 {
1249        return delete_multi_streaming(delete_chars, reader, writer);
1250    }
1251
1252    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
1253    // use vectorized range comparison instead of scalar bitset lookup.
1254    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
1255    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1256        return delete_range_streaming(lo, hi, reader, writer);
1257    }
1258
1259    let member = build_member_set(delete_chars);
1260    let mut buf = vec![0u8; STREAM_BUF];
1261
1262    loop {
1263        let n = read_full(reader, &mut buf)?;
1264        if n == 0 {
1265            break;
1266        }
1267        let mut wp = 0;
1268        unsafe {
1269            let ptr = buf.as_mut_ptr();
1270            let mut i = 0;
1271            while i + 8 <= n {
1272                let b0 = *ptr.add(i);
1273                let b1 = *ptr.add(i + 1);
1274                let b2 = *ptr.add(i + 2);
1275                let b3 = *ptr.add(i + 3);
1276                let b4 = *ptr.add(i + 4);
1277                let b5 = *ptr.add(i + 5);
1278                let b6 = *ptr.add(i + 6);
1279                let b7 = *ptr.add(i + 7);
1280
1281                // Branchless: write byte then conditionally advance pointer.
1282                // Avoids branch mispredictions when most bytes are kept.
1283                *ptr.add(wp) = b0;
1284                wp += !is_member(&member, b0) as usize;
1285                *ptr.add(wp) = b1;
1286                wp += !is_member(&member, b1) as usize;
1287                *ptr.add(wp) = b2;
1288                wp += !is_member(&member, b2) as usize;
1289                *ptr.add(wp) = b3;
1290                wp += !is_member(&member, b3) as usize;
1291                *ptr.add(wp) = b4;
1292                wp += !is_member(&member, b4) as usize;
1293                *ptr.add(wp) = b5;
1294                wp += !is_member(&member, b5) as usize;
1295                *ptr.add(wp) = b6;
1296                wp += !is_member(&member, b6) as usize;
1297                *ptr.add(wp) = b7;
1298                wp += !is_member(&member, b7) as usize;
1299                i += 8;
1300            }
1301            while i < n {
1302                let b = *ptr.add(i);
1303                *ptr.add(wp) = b;
1304                wp += !is_member(&member, b) as usize;
1305                i += 1;
1306            }
1307        }
1308        writer.write_all(&buf[..wp])?;
1309    }
1310    Ok(())
1311}
1312
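// Illustrative usage sketch (not part of the original file): five scattered
// characters, so this exercises the bitset + branchless-compaction path above
// rather than the memchr or SIMD-range fast paths.
#[cfg(test)]
mod delete_example {
    #[test]
    fn deletes_vowels() {
        let mut input = std::io::Cursor::new(&b"hello world"[..]);
        let mut output: Vec<u8> = Vec::new();
        super::delete(b"aeiou", &mut input, &mut output).unwrap();
        assert_eq!(output, b"hll wrld".to_vec());
    }
}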
1313fn delete_single_streaming(
1314    ch: u8,
1315    reader: &mut impl Read,
1316    writer: &mut impl Write,
1317) -> io::Result<()> {
1318    let mut src = vec![0u8; STREAM_BUF];
1319    let mut dst = alloc_uninit_vec(STREAM_BUF);
1320    loop {
1321        let n = read_full(reader, &mut src)?;
1322        if n == 0 {
1323            break;
1324        }
1325        // Use memchr to find byte positions, gap-copy to separate dst buffer.
1326        // Separate src/dst allows copy_nonoverlapping (faster than memmove)
1327        // and avoids aliasing concerns in the hot loop.
1328        let mut wp = 0;
1329        let mut i = 0;
1330        while i < n {
1331            match memchr::memchr(ch, &src[i..n]) {
1332                Some(offset) => {
1333                    if offset > 0 {
1334                        unsafe {
1335                            std::ptr::copy_nonoverlapping(
1336                                src.as_ptr().add(i),
1337                                dst.as_mut_ptr().add(wp),
1338                                offset,
1339                            );
1340                        }
1341                        wp += offset;
1342                    }
1343                    i += offset + 1;
1344                }
1345                None => {
1346                    let run_len = n - i;
1347                    if run_len > 0 {
1348                        unsafe {
1349                            std::ptr::copy_nonoverlapping(
1350                                src.as_ptr().add(i),
1351                                dst.as_mut_ptr().add(wp),
1352                                run_len,
1353                            );
1354                        }
1355                        wp += run_len;
1356                    }
1357                    break;
1358                }
1359            }
1360        }
1361        // If nothing was deleted, write from src directly (avoids extra copy)
1362        if wp == n {
1363            writer.write_all(&src[..n])?;
1364        } else if wp > 0 {
1365            writer.write_all(&dst[..wp])?;
1366        }
1367    }
1368    Ok(())
1369}
1370
1371fn delete_multi_streaming(
1372    chars: &[u8],
1373    reader: &mut impl Read,
1374    writer: &mut impl Write,
1375) -> io::Result<()> {
1376    let mut src = vec![0u8; STREAM_BUF];
1377    let mut dst = alloc_uninit_vec(STREAM_BUF);
1378    loop {
1379        let n = read_full(reader, &mut src)?;
1380        if n == 0 {
1381            break;
1382        }
1383        // Use memchr2/memchr3 to find byte positions, gap-copy to separate dst buffer.
1384        // Separate src/dst allows copy_nonoverlapping (faster than memmove).
1385        let mut wp = 0;
1386        let mut i = 0;
1387        while i < n {
1388            let found = if chars.len() == 2 {
1389                memchr::memchr2(chars[0], chars[1], &src[i..n])
1390            } else {
1391                memchr::memchr3(chars[0], chars[1], chars[2], &src[i..n])
1392            };
1393            match found {
1394                Some(offset) => {
1395                    if offset > 0 {
1396                        unsafe {
1397                            std::ptr::copy_nonoverlapping(
1398                                src.as_ptr().add(i),
1399                                dst.as_mut_ptr().add(wp),
1400                                offset,
1401                            );
1402                        }
1403                        wp += offset;
1404                    }
1405                    i += offset + 1;
1406                }
1407                None => {
1408                    let run_len = n - i;
1409                    if run_len > 0 {
1410                        unsafe {
1411                            std::ptr::copy_nonoverlapping(
1412                                src.as_ptr().add(i),
1413                                dst.as_mut_ptr().add(wp),
1414                                run_len,
1415                            );
1416                        }
1417                        wp += run_len;
1418                    }
1419                    break;
1420                }
1421            }
1422        }
1423        if wp == n {
1424            writer.write_all(&src[..n])?;
1425        } else if wp > 0 {
1426            writer.write_all(&dst[..wp])?;
1427        }
1428    }
1429    Ok(())
1430}
1431
1432pub fn delete_squeeze(
1433    delete_chars: &[u8],
1434    squeeze_chars: &[u8],
1435    reader: &mut impl Read,
1436    writer: &mut impl Write,
1437) -> io::Result<()> {
1438    let delete_set = build_member_set(delete_chars);
1439    let squeeze_set = build_member_set(squeeze_chars);
1440    let mut buf = vec![0u8; STREAM_BUF];
1441    let mut last_squeezed: u16 = 256;
1442
1443    loop {
1444        let n = read_full(reader, &mut buf)?;
1445        if n == 0 {
1446            break;
1447        }
1448        let mut wp = 0;
1449        unsafe {
1450            let ptr = buf.as_mut_ptr();
1451            for i in 0..n {
1452                let b = *ptr.add(i);
1453                if is_member(&delete_set, b) {
1454                    continue;
1455                }
1456                if is_member(&squeeze_set, b) {
1457                    if last_squeezed == b as u16 {
1458                        continue;
1459                    }
1460                    last_squeezed = b as u16;
1461                } else {
1462                    last_squeezed = 256;
1463                }
1464                *ptr.add(wp) = b;
1465                wp += 1;
1466            }
1467        }
1468        writer.write_all(&buf[..wp])?;
1469    }
1470    Ok(())
1471}
1472
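// Illustrative usage sketch (not part of the original file): delete digits, then
// squeeze runs of spaces, as in `tr -ds '0-9' ' '`.
#[cfg(test)]
mod delete_squeeze_example {
    #[test]
    fn delete_then_squeeze() {
        let mut input = std::io::Cursor::new(&b"a1 2  b33"[..]);
        let mut output: Vec<u8> = Vec::new();
        super::delete_squeeze(b"0123456789", b" ", &mut input, &mut output).unwrap();
        assert_eq!(output, b"a b".to_vec());
    }
}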
1473pub fn squeeze(
1474    squeeze_chars: &[u8],
1475    reader: &mut impl Read,
1476    writer: &mut impl Write,
1477) -> io::Result<()> {
1478    if squeeze_chars.len() == 1 {
1479        return squeeze_single_stream(squeeze_chars[0], reader, writer);
1480    }
1481
1482    let member = build_member_set(squeeze_chars);
1483    let mut buf = vec![0u8; STREAM_BUF];
1484    let mut last_squeezed: u16 = 256;
1485
1486    loop {
1487        let n = read_full(reader, &mut buf)?;
1488        if n == 0 {
1489            break;
1490        }
1491        let mut wp = 0;
1492        unsafe {
1493            let ptr = buf.as_mut_ptr();
1494            for i in 0..n {
1495                let b = *ptr.add(i);
1496                if is_member(&member, b) {
1497                    if last_squeezed == b as u16 {
1498                        continue;
1499                    }
1500                    last_squeezed = b as u16;
1501                } else {
1502                    last_squeezed = 256;
1503                }
1504                *ptr.add(wp) = b;
1505                wp += 1;
1506            }
1507        }
1508        writer.write_all(&buf[..wp])?;
1509    }
1510    Ok(())
1511}
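// Illustrative sketch (hypothetical test module, not from the original source): with
// two squeeze characters, squeeze above takes the general bitset path rather than the
// single-character fast path; adjacent runs of either character collapse to one byte.
#[cfg(test)]
mod squeeze_stream_example {
    use super::*;

    #[test]
    fn squeezes_spaces_and_tabs() {
        let mut input: &[u8] = b"a  b\t\tc";
        let mut out: Vec<u8> = Vec::new();
        squeeze(b" \t", &mut input, &mut out).unwrap();
        assert_eq!(out, b"a b\tc".to_vec());
    }
}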
1512
1513fn squeeze_single_stream(
1514    ch: u8,
1515    reader: &mut impl Read,
1516    writer: &mut impl Write,
1517) -> io::Result<()> {
1518    // Use a two-byte pattern finder (ch,ch) to jump directly to squeeze points.
1519    // For typical input (e.g., squeezing spaces), most of the data contains no
1520    // consecutive occurrences, so memmem skips huge regions at SIMD speed. When a
1521    // pair is found, we skip past the rest of the run and keep a single occurrence.
1522    let pair = [ch, ch];
1523    let finder = memchr::memmem::Finder::new(&pair);
1524    let mut buf = vec![0u8; STREAM_BUF];
1525    let mut was_squeeze_char = false;
1526
1527    loop {
1528        let n = read_full(reader, &mut buf)?;
1529        if n == 0 {
1530            break;
1531        }
1532
1533        let ptr = buf.as_mut_ptr();
1534        let mut wp = 0;
1535        let mut i = 0;
1536
1537        // Handle carry-over: if previous chunk ended with squeeze char,
1538        // skip leading occurrences of that char in this chunk.
1539        if was_squeeze_char {
1540            while i < n && unsafe { *ptr.add(i) } == ch {
1541                i += 1;
1542            }
1543            if i >= n {
1544                // Entire chunk is squeeze-char continuation
1545                // was_squeeze_char remains true
1546                continue;
1547            }
1548        }
1549
1550        // Use memmem to find consecutive pairs (ch, ch) — SIMD-accelerated.
1551        // Between pairs, the data passes through unchanged.
1552        loop {
1553            match finder.find(&buf[i..n]) {
1554                Some(offset) => {
1555                    // Copy everything up to and including the first char of the pair
1556                    let copy_len = offset + 1; // include first ch
1557                    if copy_len > 0 && wp != i {
1558                        unsafe {
1559                            std::ptr::copy(ptr.add(i), ptr.add(wp), copy_len);
1560                        }
1561                    }
1562                    wp += copy_len;
1563                    i += offset + 1; // position after first ch of pair
1564                    // Skip all remaining consecutive ch bytes (the run)
1565                    while i < n && unsafe { *ptr.add(i) } == ch {
1566                        i += 1;
1567                    }
1568                    if i >= n {
1569                        was_squeeze_char = true;
1570                        break;
1571                    }
1572                }
1573                None => {
1574                    // No more consecutive pairs — copy remainder
1575                    let run_len = n - i;
1576                    if run_len > 0 && wp != i {
1577                        unsafe {
1578                            std::ptr::copy(ptr.add(i), ptr.add(wp), run_len);
1579                        }
1580                    }
1581                    wp += run_len;
1582                    // Check if chunk ends with the squeeze char
1583                    was_squeeze_char = n > 0 && unsafe { *ptr.add(n - 1) } == ch;
1584                    break;
1585                }
1586            }
1587        }
1588
1589        writer.write_all(&buf[..wp])?;
1590    }
1591    Ok(())
1592}
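// Illustrative sketch (hypothetical test module, not from the original source): a
// single squeeze character routes through squeeze_single_stream and its (ch, ch)
// memmem finder — only runs of 'o' collapse, isolated bytes pass through untouched.
#[cfg(test)]
mod squeeze_single_stream_example {
    use super::*;

    #[test]
    fn collapses_runs_of_one_char() {
        let mut input: &[u8] = b"foo booo bar";
        let mut out: Vec<u8> = Vec::new();
        squeeze(b"o", &mut input, &mut out).unwrap();
        assert_eq!(out, b"fo bo bar".to_vec());
    }
}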
1593
1594// ============================================================================
1595// Mmap-based functions (zero-copy input from byte slice)
1596// ============================================================================
1597
1598/// Maximum data size for single-allocation translate approach.
1599/// Below this limit, translate ALL data into one buffer and do a single write_all.
1600/// Above this, use chunked approach to limit memory usage.
1601const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;
1602
1603/// Translate bytes from an mmap'd byte slice.
1604/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
1605/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
1606/// Falls back to scalar 256-byte table lookup for general translations.
1607///
1608/// Data >= 2MB: rayon parallel processing across multiple cores (checked first; one buffer, one write_all).
1609/// Data < 2MB: single allocation + single write_all (1 syscall).
1610/// A chunked 4MB-buffer fallback exists but is unreachable while PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT.
1611pub fn translate_mmap(
1612    set1: &[u8],
1613    set2: &[u8],
1614    data: &[u8],
1615    writer: &mut impl Write,
1616) -> io::Result<()> {
1617    let table = build_translate_table(set1, set2);
1618
1619    // Check if table is identity — pure passthrough
1620    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1621    if is_identity {
1622        return writer.write_all(data);
1623    }
1624
1625    // Try SIMD fast path for single-range constant-offset translations
1626    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1627        return translate_mmap_range(data, writer, lo, hi, offset);
1628    }
1629
1630    // General case: table lookup (with parallel processing for large data)
1631    translate_mmap_table(data, writer, &table)
1632}
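// Illustrative sketch (hypothetical test module, not from the original source): the
// classic a-z -> A-Z translation should be recognized by detect_range_offset and served
// by the SIMD range path; the expected output assumes those helpers behave as documented.
#[cfg(test)]
mod translate_mmap_example {
    use super::*;

    #[test]
    fn uppercases_ascii_lowercase() {
        let mut out: Vec<u8> = Vec::new();
        translate_mmap(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
            b"hello, world!",
            &mut out,
        )
        .unwrap();
        assert_eq!(out, b"HELLO, WORLD!".to_vec());
    }
}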
1633
1634/// SIMD range translate for mmap data, with rayon parallel processing.
1635fn translate_mmap_range(
1636    data: &[u8],
1637    writer: &mut impl Write,
1638    lo: u8,
1639    hi: u8,
1640    offset: i8,
1641) -> io::Result<()> {
1642    // Parallel path: split data into chunks, translate each in parallel
1643    if data.len() >= PARALLEL_THRESHOLD {
1644        let mut buf = alloc_uninit_vec(data.len());
1645        let n_threads = rayon::current_num_threads().max(1);
1646        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1647
1648        // Process chunks in parallel: each thread writes to its slice of buf
1649        data.par_chunks(chunk_size)
1650            .zip(buf.par_chunks_mut(chunk_size))
1651            .for_each(|(src_chunk, dst_chunk)| {
1652                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
1653            });
1654
1655        return writer.write_all(&buf);
1656    }
1657
1658    // Small data: single-threaded SIMD
1659    if data.len() <= SINGLE_WRITE_LIMIT {
1660        let mut buf = alloc_uninit_vec(data.len());
1661        translate_range_simd(data, &mut buf, lo, hi, offset);
1662        return writer.write_all(&buf);
1663    }
1664    // Chunked path for large data (shouldn't happen since PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT)
1665    let mut buf = alloc_uninit_vec(BUF_SIZE);
1666    for chunk in data.chunks(BUF_SIZE) {
1667        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
1668        writer.write_all(&buf[..chunk.len()])?;
1669    }
1670    Ok(())
1671}
1672
1673/// General table-lookup translate for mmap data, with rayon parallel processing.
1674fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
1675    // Parallel path: split data into chunks, translate each in parallel
1676    if data.len() >= PARALLEL_THRESHOLD {
1677        let mut buf = alloc_uninit_vec(data.len());
1678        let n_threads = rayon::current_num_threads().max(1);
1679        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1680
1681        data.par_chunks(chunk_size)
1682            .zip(buf.par_chunks_mut(chunk_size))
1683            .for_each(|(src_chunk, dst_chunk)| {
1684                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
1685            });
1686
1687        return writer.write_all(&buf);
1688    }
1689
1690    // Small data: single-threaded
1691    if data.len() <= SINGLE_WRITE_LIMIT {
1692        let mut buf = alloc_uninit_vec(data.len());
1693        translate_to(data, &mut buf, table);
1694        return writer.write_all(&buf);
1695    }
1696    let mut buf = alloc_uninit_vec(BUF_SIZE);
1697    for chunk in data.chunks(BUF_SIZE) {
1698        translate_to(chunk, &mut buf[..chunk.len()], table);
1699        writer.write_all(&buf[..chunk.len()])?;
1700    }
1701    Ok(())
1702}
1703
1704/// Translate + squeeze from mmap'd byte slice.
1705///
1706/// Data >= 2MB: two-phase approach (parallel translate, then sequential in-place squeeze), one write_all.
1707/// Data < 2MB: single-pass translate+squeeze into one buffer, one write syscall.
1708/// A chunked fallback exists but is unreachable while PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT.
1709pub fn translate_squeeze_mmap(
1710    set1: &[u8],
1711    set2: &[u8],
1712    data: &[u8],
1713    writer: &mut impl Write,
1714) -> io::Result<()> {
1715    let table = build_translate_table(set1, set2);
1716    let squeeze_set = build_member_set(set2);
1717
1718    // For large data: two-phase approach
1719    // Phase 1: parallel translate into buffer
1720    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
1721    //          (squeeze only removes bytes, never grows, so no second allocation needed)
1722    if data.len() >= PARALLEL_THRESHOLD {
1723        // Phase 1: parallel translate
1724        let mut translated = alloc_uninit_vec(data.len());
1725        let range_info = detect_range_offset(&table);
1726        let n_threads = rayon::current_num_threads().max(1);
1727        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1728
1729        if let Some((lo, hi, offset)) = range_info {
1730            data.par_chunks(chunk_size)
1731                .zip(translated.par_chunks_mut(chunk_size))
1732                .for_each(|(src_chunk, dst_chunk)| {
1733                    translate_range_simd(
1734                        src_chunk,
1735                        &mut dst_chunk[..src_chunk.len()],
1736                        lo,
1737                        hi,
1738                        offset,
1739                    );
1740                });
1741        } else {
1742            data.par_chunks(chunk_size)
1743                .zip(translated.par_chunks_mut(chunk_size))
1744                .for_each(|(src_chunk, dst_chunk)| {
1745                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
1746                });
1747        }
1748
1749        // Phase 2: squeeze in-place on the translated buffer.
1750        // Since squeeze only removes bytes (never grows), we can read ahead and
1751        // compact into the same buffer, saving a full data.len() heap allocation.
1752        let mut last_squeezed: u16 = 256;
1753        let len = translated.len();
1754        let mut wp = 0;
1755        unsafe {
1756            let ptr = translated.as_mut_ptr();
1757            let mut i = 0;
1758            while i < len {
1759                let b = *ptr.add(i);
1760                if is_member(&squeeze_set, b) {
1761                    if last_squeezed == b as u16 {
1762                        i += 1;
1763                        continue;
1764                    }
1765                    last_squeezed = b as u16;
1766                } else {
1767                    last_squeezed = 256;
1768                }
1769                *ptr.add(wp) = b;
1770                wp += 1;
1771                i += 1;
1772            }
1773        }
1774        return writer.write_all(&translated[..wp]);
1775    }
1776
1777    // Single-write fast path: translate+squeeze all data in one pass, one write
1778    if data.len() <= SINGLE_WRITE_LIMIT {
1779        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
1780        let mut last_squeezed: u16 = 256;
1781        unsafe {
1782            buf.set_len(data.len());
1783            let outp: *mut u8 = buf.as_mut_ptr();
1784            let inp = data.as_ptr();
1785            let len = data.len();
1786            let mut wp = 0;
1787            let mut i = 0;
1788            while i < len {
1789                let translated = *table.get_unchecked(*inp.add(i) as usize);
1790                if is_member(&squeeze_set, translated) {
1791                    if last_squeezed == translated as u16 {
1792                        i += 1;
1793                        continue;
1794                    }
1795                    last_squeezed = translated as u16;
1796                } else {
1797                    last_squeezed = 256;
1798                }
1799                *outp.add(wp) = translated;
1800                wp += 1;
1801                i += 1;
1802            }
1803            buf.set_len(wp);
1804        }
1805        return writer.write_all(&buf);
1806    }
1807
1808    // Chunked path for large data
1809    let buf_size = data.len().min(BUF_SIZE);
1810    let mut buf = vec![0u8; buf_size];
1811    let mut last_squeezed: u16 = 256;
1812
1813    for chunk in data.chunks(buf_size) {
1814        translate_to(chunk, &mut buf[..chunk.len()], &table);
1815        let mut wp = 0;
1816        unsafe {
1817            let ptr = buf.as_mut_ptr();
1818            for i in 0..chunk.len() {
1819                let b = *ptr.add(i);
1820                if is_member(&squeeze_set, b) {
1821                    if last_squeezed == b as u16 {
1822                        continue;
1823                    }
1824                    last_squeezed = b as u16;
1825                } else {
1826                    last_squeezed = 256;
1827                }
1828                *ptr.add(wp) = b;
1829                wp += 1;
1830            }
1831        }
1832        writer.write_all(&buf[..wp])?;
1833    }
1834    Ok(())
1835}
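// Illustrative sketch (hypothetical test module, not from the original source):
// mirrors `tr -s ' ' '_'` — spaces are translated to '_' and runs of the translated
// byte (a member of set2) are squeezed in the same pass.
#[cfg(test)]
mod translate_squeeze_mmap_example {
    use super::*;

    #[test]
    fn translates_then_squeezes_set2() {
        let mut out: Vec<u8> = Vec::new();
        translate_squeeze_mmap(b" ", b"_", b"a  b c", &mut out).unwrap();
        assert_eq!(out, b"a_b_c".to_vec());
    }
}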
1836
1837/// Delete from mmap'd byte slice.
1838///
1839/// Data >= 2MB: rayon parallel processing across multiple cores (checked first; single write_all or a writev batch).
1840/// Data < 2MB: delete into one buffer, one write syscall.
1841/// A chunked fallback exists but is unreachable while PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT.
1842pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1843    if delete_chars.len() == 1 {
1844        return delete_single_char_mmap(delete_chars[0], data, writer);
1845    }
1846    if delete_chars.len() <= 3 {
1847        return delete_multi_memchr_mmap(delete_chars, data, writer);
1848    }
1849
1850    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
1851    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1852        return delete_range_mmap(data, writer, lo, hi);
1853    }
1854
1855    let member = build_member_set(delete_chars);
1856
1857    // Parallel path: pre-allocate a single output buffer of data.len() and have each
1858    // thread write to its non-overlapping slice, then do a single write_all.
1859    // This avoids allocating a separate per-chunk Vec on every thread.
1860    if data.len() >= PARALLEL_THRESHOLD {
1861        let n_threads = rayon::current_num_threads().max(1);
1862        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1863
1864        // Each thread deletes into its slice of outbuf and returns bytes written.
1865        let mut outbuf = alloc_uninit_vec(data.len());
1866        let chunk_lens: Vec<usize> = data
1867            .par_chunks(chunk_size)
1868            .zip(outbuf.par_chunks_mut(chunk_size))
1869            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
1870            .collect();
1871
1872        // Compact: move each chunk's output to be contiguous.
1873        // chunk_lens[i] is how many bytes thread i wrote into its slice.
1874        // We shift them together because each dst_chunk begins at a multiple of chunk_size.
1875        let mut write_pos = 0;
1876        let mut src_offset = 0;
1877        for &clen in &chunk_lens {
1878            if clen > 0 && src_offset != write_pos {
1879                unsafe {
1880                    std::ptr::copy(
1881                        outbuf.as_ptr().add(src_offset),
1882                        outbuf.as_mut_ptr().add(write_pos),
1883                        clen,
1884                    );
1885                }
1886            }
1887            write_pos += clen;
1888            src_offset += chunk_size;
1889        }
1890
1891        return writer.write_all(&outbuf[..write_pos]);
1892    }
1893
1894    // Single-write fast path: delete into one buffer, one write
1895    if data.len() <= SINGLE_WRITE_LIMIT {
1896        let mut outbuf = alloc_uninit_vec(data.len());
1897        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
1898        return writer.write_all(&outbuf[..out_pos]);
1899    }
1900
1901    // Chunked path for large data
1902    let buf_size = data.len().min(BUF_SIZE);
1903    let mut outbuf = alloc_uninit_vec(buf_size);
1904
1905    for chunk in data.chunks(buf_size) {
1906        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
1907        writer.write_all(&outbuf[..out_pos])?;
1908    }
1909    Ok(())
1910}
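// Illustrative sketch (hypothetical test module, not from the original source): five
// non-contiguous delete characters should fall through the memchr and range fast paths
// to the bitset path; the deleted output is the same whichever path runs.
#[cfg(test)]
mod delete_mmap_example {
    use super::*;

    #[test]
    fn deletes_vowels() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b"aeiou", b"programming rust", &mut out).unwrap();
        assert_eq!(out, b"prgrmmng rst".to_vec());
    }
}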
1911
1912/// SIMD range delete for mmap data, with rayon parallel processing.
1913fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
1914    // Parallel path: each thread deletes from its chunk into a local Vec
1915    if data.len() >= PARALLEL_THRESHOLD {
1916        let n_threads = rayon::current_num_threads().max(1);
1917        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1918
1919        let results: Vec<Vec<u8>> = data
1920            .par_chunks(chunk_size)
1921            .map(|chunk| {
1922                let mut out = alloc_uninit_vec(chunk.len());
1923                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
1924                unsafe { out.set_len(wp) };
1925                out
1926            })
1927            .collect();
1928
1929        let slices: Vec<std::io::IoSlice> = results
1930            .iter()
1931            .filter(|r| !r.is_empty())
1932            .map(|r| std::io::IoSlice::new(r))
1933            .collect();
1934        return write_ioslices(writer, &slices);
1935    }
1936
1937    // Single-write fast path
1938    if data.len() <= SINGLE_WRITE_LIMIT {
1939        let mut outbuf = alloc_uninit_vec(data.len());
1940        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
1941        return writer.write_all(&outbuf[..wp]);
1942    }
1943
1944    // Chunked path
1945    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
1946    for chunk in data.chunks(BUF_SIZE) {
1947        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
1948        writer.write_all(&outbuf[..wp])?;
1949    }
1950    Ok(())
1951}
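// Illustrative sketch (hypothetical test module, not from the original source): a
// contiguous digit set is expected to hit detect_delete_range and the SIMD range path
// above; even if it did not, the deleted output would be identical.
#[cfg(test)]
mod delete_range_mmap_example {
    use super::*;

    #[test]
    fn deletes_digit_range() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b"0123456789", b"a1b22c333d", &mut out).unwrap();
        assert_eq!(out, b"abcd".to_vec());
    }
}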
1952
1953/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
1954/// Returns number of bytes written.
1955#[inline]
1956fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
1957    let len = chunk.len();
1958    let mut out_pos = 0;
1959    let mut i = 0;
1960
1961    while i + 8 <= len {
1962        unsafe {
1963            let b0 = *chunk.get_unchecked(i);
1964            let b1 = *chunk.get_unchecked(i + 1);
1965            let b2 = *chunk.get_unchecked(i + 2);
1966            let b3 = *chunk.get_unchecked(i + 3);
1967            let b4 = *chunk.get_unchecked(i + 4);
1968            let b5 = *chunk.get_unchecked(i + 5);
1969            let b6 = *chunk.get_unchecked(i + 6);
1970            let b7 = *chunk.get_unchecked(i + 7);
1971
1972            *outbuf.get_unchecked_mut(out_pos) = b0;
1973            out_pos += !is_member(member, b0) as usize;
1974            *outbuf.get_unchecked_mut(out_pos) = b1;
1975            out_pos += !is_member(member, b1) as usize;
1976            *outbuf.get_unchecked_mut(out_pos) = b2;
1977            out_pos += !is_member(member, b2) as usize;
1978            *outbuf.get_unchecked_mut(out_pos) = b3;
1979            out_pos += !is_member(member, b3) as usize;
1980            *outbuf.get_unchecked_mut(out_pos) = b4;
1981            out_pos += !is_member(member, b4) as usize;
1982            *outbuf.get_unchecked_mut(out_pos) = b5;
1983            out_pos += !is_member(member, b5) as usize;
1984            *outbuf.get_unchecked_mut(out_pos) = b6;
1985            out_pos += !is_member(member, b6) as usize;
1986            *outbuf.get_unchecked_mut(out_pos) = b7;
1987            out_pos += !is_member(member, b7) as usize;
1988        }
1989        i += 8;
1990    }
1991
1992    while i < len {
1993        unsafe {
1994            let b = *chunk.get_unchecked(i);
1995            *outbuf.get_unchecked_mut(out_pos) = b;
1996            out_pos += !is_member(member, b) as usize;
1997        }
1998        i += 1;
1999    }
2000
2001    out_pos
2002}
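// Illustrative sketch (hypothetical test module, not from the original source):
// demonstrates the branchless contract of delete_chunk_bitset_into — every byte is
// stored, the write position only advances for non-members, and the return value
// bounds the valid output.
#[cfg(test)]
mod delete_chunk_bitset_example {
    use super::*;

    #[test]
    fn compacts_non_members() {
        let member = build_member_set(b"_-");
        let mut out = [0u8; 5];
        let written = delete_chunk_bitset_into(b"a_b-c", &member, &mut out);
        assert_eq!(written, 3);
        assert_eq!(&out[..written], b"abc");
    }
}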
2003
2004fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2005    // Parallel path for large data: each thread deletes from its chunk,
2006    // then use writev to write all results in one syscall batch.
2007    if data.len() >= PARALLEL_THRESHOLD {
2008        let n_threads = rayon::current_num_threads().max(1);
2009        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2010
2011        let results: Vec<Vec<u8>> = data
2012            .par_chunks(chunk_size)
2013            .map(|chunk| {
2014                let mut out = Vec::with_capacity(chunk.len());
2015                let mut last = 0;
2016                for pos in memchr::memchr_iter(ch, chunk) {
2017                    if pos > last {
2018                        out.extend_from_slice(&chunk[last..pos]);
2019                    }
2020                    last = pos + 1;
2021                }
2022                if last < chunk.len() {
2023                    out.extend_from_slice(&chunk[last..]);
2024                }
2025                out
2026            })
2027            .collect();
2028
2029        // Use writev to batch all results into fewer syscalls
2030        let slices: Vec<std::io::IoSlice> = results
2031            .iter()
2032            .filter(|r| !r.is_empty())
2033            .map(|r| std::io::IoSlice::new(r))
2034            .collect();
2035        return write_ioslices(writer, &slices);
2036    }
2037
2038    // Single-write fast path: collect all non-deleted spans into one buffer
2039    if data.len() <= SINGLE_WRITE_LIMIT {
2040        let mut outbuf = Vec::with_capacity(data.len());
2041        let mut last = 0;
2042        for pos in memchr::memchr_iter(ch, data) {
2043            if pos > last {
2044                outbuf.extend_from_slice(&data[last..pos]);
2045            }
2046            last = pos + 1;
2047        }
2048        if last < data.len() {
2049            outbuf.extend_from_slice(&data[last..]);
2050        }
2051        return writer.write_all(&outbuf);
2052    }
2053
2054    // Chunked path for large data
2055    let buf_size = data.len().min(BUF_SIZE);
2056    let mut outbuf = vec![0u8; buf_size];
2057
2058    for chunk in data.chunks(buf_size) {
2059        let mut wp = 0;
2060        let mut last = 0;
2061        for pos in memchr::memchr_iter(ch, chunk) {
2062            if pos > last {
2063                let run = pos - last;
2064                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
2065                wp += run;
2066            }
2067            last = pos + 1;
2068        }
2069        if last < chunk.len() {
2070            let run = chunk.len() - last;
2071            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
2072            wp += run;
2073        }
2074        writer.write_all(&outbuf[..wp])?;
2075    }
2076    Ok(())
2077}
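// Illustrative sketch (hypothetical test module, not from the original source): a
// one-byte delete set dispatches from delete_mmap to the memchr_iter-based path above.
#[cfg(test)]
mod delete_single_char_mmap_example {
    use super::*;

    #[test]
    fn strips_newlines() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b"\n", b"line1\nline2\n", &mut out).unwrap();
        assert_eq!(out, b"line1line2".to_vec());
    }
}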
2078
2079fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2080    let c0 = chars[0];
2081    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
2082    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
2083    let is_three = chars.len() >= 3;
2084
2085    // Parallel path for large data
2086    if data.len() >= PARALLEL_THRESHOLD {
2087        let n_threads = rayon::current_num_threads().max(1);
2088        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2089
2090        let results: Vec<Vec<u8>> = data
2091            .par_chunks(chunk_size)
2092            .map(|chunk| {
2093                let mut out = Vec::with_capacity(chunk.len());
2094                let mut last = 0;
2095                if is_three {
2096                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
2097                        if pos > last {
2098                            out.extend_from_slice(&chunk[last..pos]);
2099                        }
2100                        last = pos + 1;
2101                    }
2102                } else {
2103                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
2104                        if pos > last {
2105                            out.extend_from_slice(&chunk[last..pos]);
2106                        }
2107                        last = pos + 1;
2108                    }
2109                }
2110                if last < chunk.len() {
2111                    out.extend_from_slice(&chunk[last..]);
2112                }
2113                out
2114            })
2115            .collect();
2116
2117        // Use writev to batch all results into fewer syscalls
2118        let slices: Vec<std::io::IoSlice> = results
2119            .iter()
2120            .filter(|r| !r.is_empty())
2121            .map(|r| std::io::IoSlice::new(r))
2122            .collect();
2123        return write_ioslices(writer, &slices);
2124    }
2125
2126    // Single-write fast path: collect all non-deleted spans into one buffer
2127    if data.len() <= SINGLE_WRITE_LIMIT {
2128        let mut outbuf = Vec::with_capacity(data.len());
2129        let mut last = 0;
2130        if is_three {
2131            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
2132                if pos > last {
2133                    outbuf.extend_from_slice(&data[last..pos]);
2134                }
2135                last = pos + 1;
2136            }
2137        } else {
2138            for pos in memchr::memchr2_iter(c0, c1, data) {
2139                if pos > last {
2140                    outbuf.extend_from_slice(&data[last..pos]);
2141                }
2142                last = pos + 1;
2143            }
2144        }
2145        if last < data.len() {
2146            outbuf.extend_from_slice(&data[last..]);
2147        }
2148        return writer.write_all(&outbuf);
2149    }
2150
2151    // Chunked path for large data
2152    let buf_size = data.len().min(BUF_SIZE);
2153    let mut outbuf = vec![0u8; buf_size];
2154
2155    for chunk in data.chunks(buf_size) {
2156        let mut wp = 0;
2157        let mut last = 0;
2158
2159        // Iterate directly over memchr iterator without collecting into Vec<usize>.
2160        // Positions are used exactly once in order, so no intermediate allocation needed.
2161        if is_three {
2162            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
2163                if pos > last {
2164                    let run = pos - last;
2165                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
2166                    wp += run;
2167                }
2168                last = pos + 1;
2169            }
2170        } else {
2171            for pos in memchr::memchr2_iter(c0, c1, chunk) {
2172                if pos > last {
2173                    let run = pos - last;
2174                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
2175                    wp += run;
2176                }
2177                last = pos + 1;
2178            }
2179        }
2180
2181        if last < chunk.len() {
2182            let run = chunk.len() - last;
2183            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
2184            wp += run;
2185        }
2186        writer.write_all(&outbuf[..wp])?;
2187    }
2188    Ok(())
2189}
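// Illustrative sketch (hypothetical test module, not from the original source): a
// two-byte delete set dispatches from delete_mmap to the memchr2_iter path above.
#[cfg(test)]
mod delete_multi_memchr_mmap_example {
    use super::*;

    #[test]
    fn strips_crlf_bytes() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b"\r\n", b"a\r\nb\r\nc", &mut out).unwrap();
        assert_eq!(out, b"abc".to_vec());
    }
}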
2190
2191/// Delete + squeeze from mmap'd byte slice.
2192///
2193/// For data <= 16MB: delete+squeeze into one buffer, one write syscall.
2194/// For data > 16MB: chunked approach to limit memory.
2195pub fn delete_squeeze_mmap(
2196    delete_chars: &[u8],
2197    squeeze_chars: &[u8],
2198    data: &[u8],
2199    writer: &mut impl Write,
2200) -> io::Result<()> {
2201    let delete_set = build_member_set(delete_chars);
2202    let squeeze_set = build_member_set(squeeze_chars);
2203
2204    // Single-write fast path: delete+squeeze all data in one pass, one write
2205    if data.len() <= SINGLE_WRITE_LIMIT {
2206        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
2207        let mut last_squeezed: u16 = 256;
2208        unsafe {
2209            outbuf.set_len(data.len());
2210            let outp: *mut u8 = outbuf.as_mut_ptr();
2211            let inp = data.as_ptr();
2212            let len = data.len();
2213            let mut out_pos = 0;
2214            let mut i = 0;
2215            while i < len {
2216                let b = *inp.add(i);
2217                if is_member(&delete_set, b) {
2218                    i += 1;
2219                    continue;
2220                }
2221                if is_member(&squeeze_set, b) {
2222                    if last_squeezed == b as u16 {
2223                        i += 1;
2224                        continue;
2225                    }
2226                    last_squeezed = b as u16;
2227                } else {
2228                    last_squeezed = 256;
2229                }
2230                *outp.add(out_pos) = b;
2231                out_pos += 1;
2232                i += 1;
2233            }
2234            outbuf.set_len(out_pos);
2235        }
2236        return writer.write_all(&outbuf);
2237    }
2238
2239    // Chunked path for large data
2240    let buf_size = data.len().min(BUF_SIZE);
2241    let mut outbuf = vec![0u8; buf_size];
2242    let mut last_squeezed: u16 = 256;
2243
2244    for chunk in data.chunks(buf_size) {
2245        let mut out_pos = 0;
2246        for &b in chunk {
2247            if is_member(&delete_set, b) {
2248                continue;
2249            }
2250            if is_member(&squeeze_set, b) {
2251                if last_squeezed == b as u16 {
2252                    continue;
2253                }
2254                last_squeezed = b as u16;
2255            } else {
2256                last_squeezed = 256;
2257            }
2258            unsafe {
2259                *outbuf.get_unchecked_mut(out_pos) = b;
2260            }
2261            out_pos += 1;
2262        }
2263        writer.write_all(&outbuf[..out_pos])?;
2264    }
2265    Ok(())
2266}
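// Illustrative sketch (hypothetical test module, not from the original source):
// deletion is applied before squeezing, so the spaces separated only by deleted '-'
// bytes still collapse into one.
#[cfg(test)]
mod delete_squeeze_mmap_example {
    use super::*;

    #[test]
    fn deletes_then_squeezes() {
        let mut out: Vec<u8> = Vec::new();
        delete_squeeze_mmap(b"-", b" ", b"a -- b  c", &mut out).unwrap();
        assert_eq!(out, b"a b c".to_vec());
    }
}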
2267
2268/// Squeeze from mmap'd byte slice.
2269///
2270/// Data >= 2MB: rayon parallel processing with boundary fixup, batched via writev (checked first).
2271/// Data < 2MB: squeeze into one buffer, one write syscall.
2272/// A chunked fallback exists but is unreachable while PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT.
2273pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2274    if squeeze_chars.len() == 1 {
2275        return squeeze_single_mmap(squeeze_chars[0], data, writer);
2276    }
2277    if squeeze_chars.len() == 2 {
2278        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
2279    }
2280    if squeeze_chars.len() == 3 {
2281        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
2282    }
2283
2284    let member = build_member_set(squeeze_chars);
2285
2286    // Parallel path: squeeze each chunk independently, then fix boundaries
2287    if data.len() >= PARALLEL_THRESHOLD {
2288        let n_threads = rayon::current_num_threads().max(1);
2289        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2290
2291        let results: Vec<Vec<u8>> = data
2292            .par_chunks(chunk_size)
2293            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2294            .collect();
2295
2296        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
2297        // and chunk N+1 starts with same byte B, and B is in squeeze set,
2298        // skip the first byte(s) of chunk N+1 that equal B.
2299        // Collect slices for writev to minimize syscalls.
2300        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2301        for (idx, result) in results.iter().enumerate() {
2302            if result.is_empty() {
2303                continue;
2304            }
2305            if idx > 0 {
2306                // Check boundary: does previous chunk end with same squeezable byte?
2307                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2308                    if is_member(&member, prev_last) {
2309                        // Skip leading bytes in this chunk that equal prev_last
2310                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2311                        if skip < result.len() {
2312                            slices.push(std::io::IoSlice::new(&result[skip..]));
2313                        }
2314                        continue;
2315                    }
2316                }
2317            }
2318            slices.push(std::io::IoSlice::new(result));
2319        }
2320        return write_ioslices(writer, &slices);
2321    }
2322
2323    // Single-write fast path: squeeze all data into one buffer, one write
2324    if data.len() <= SINGLE_WRITE_LIMIT {
2325        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
2326        let mut last_squeezed: u16 = 256;
2327        let len = data.len();
2328        let mut wp = 0;
2329        let mut i = 0;
2330
2331        unsafe {
2332            outbuf.set_len(data.len());
2333            let inp = data.as_ptr();
2334            let outp: *mut u8 = outbuf.as_mut_ptr();
2335
2336            while i < len {
2337                let b = *inp.add(i);
2338                if is_member(&member, b) {
2339                    if last_squeezed != b as u16 {
2340                        *outp.add(wp) = b;
2341                        wp += 1;
2342                        last_squeezed = b as u16;
2343                    }
2344                    i += 1;
2345                    while i < len && *inp.add(i) == b {
2346                        i += 1;
2347                    }
2348                } else {
2349                    last_squeezed = 256;
2350                    *outp.add(wp) = b;
2351                    wp += 1;
2352                    i += 1;
2353                }
2354            }
2355            outbuf.set_len(wp);
2356        }
2357        return writer.write_all(&outbuf);
2358    }
2359
2360    // Chunked path for large data
2361    let buf_size = data.len().min(BUF_SIZE);
2362    let mut outbuf = vec![0u8; buf_size];
2363    let mut last_squeezed: u16 = 256;
2364
2365    for chunk in data.chunks(buf_size) {
2366        let len = chunk.len();
2367        let mut wp = 0;
2368        let mut i = 0;
2369
2370        unsafe {
2371            let inp = chunk.as_ptr();
2372            let outp = outbuf.as_mut_ptr();
2373
2374            while i < len {
2375                let b = *inp.add(i);
2376                if is_member(&member, b) {
2377                    if last_squeezed != b as u16 {
2378                        *outp.add(wp) = b;
2379                        wp += 1;
2380                        last_squeezed = b as u16;
2381                    }
2382                    i += 1;
2383                    while i < len && *inp.add(i) == b {
2384                        i += 1;
2385                    }
2386                } else {
2387                    last_squeezed = 256;
2388                    *outp.add(wp) = b;
2389                    wp += 1;
2390                    i += 1;
2391                }
2392            }
2393        }
2394        writer.write_all(&outbuf[..wp])?;
2395    }
2396    Ok(())
2397}
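// Illustrative sketch (hypothetical test module, not from the original source): four
// squeeze characters bypass the specialized 1-3 character dispatches and exercise the
// general bitset path of squeeze_mmap.
#[cfg(test)]
mod squeeze_mmap_example {
    use super::*;

    #[test]
    fn squeezes_whitespace_runs() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b" \t\r\n", b"a  b\t\tc", &mut out).unwrap();
        assert_eq!(out, b"a b\tc".to_vec());
    }
}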
2398
2399/// Squeeze a single chunk using bitset membership. Returns squeezed output.
2400fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
2401    let len = chunk.len();
2402    let mut out = Vec::with_capacity(len);
2403    let mut last_squeezed: u16 = 256;
2404    let mut i = 0;
2405
2406    unsafe {
2407        out.set_len(len);
2408        let inp = chunk.as_ptr();
2409        let outp: *mut u8 = out.as_mut_ptr();
2410        let mut wp = 0;
2411
2412        while i < len {
2413            let b = *inp.add(i);
2414            if is_member(member, b) {
2415                if last_squeezed != b as u16 {
2416                    *outp.add(wp) = b;
2417                    wp += 1;
2418                    last_squeezed = b as u16;
2419                }
2420                i += 1;
2421                while i < len && *inp.add(i) == b {
2422                    i += 1;
2423                }
2424            } else {
2425                last_squeezed = 256;
2426                *outp.add(wp) = b;
2427                wp += 1;
2428                i += 1;
2429            }
2430        }
2431        out.set_len(wp);
2432    }
2433    out
2434}
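// Illustrative sketch (hypothetical test module, not from the original source): shows
// the per-chunk helper in isolation — runs of members collapse, while distinct
// adjacent members are both kept.
#[cfg(test)]
mod squeeze_chunk_bitset_example {
    use super::*;

    #[test]
    fn collapses_member_runs() {
        let member = build_member_set(b"xy");
        assert_eq!(squeeze_chunk_bitset(b"xxyyx", &member), b"xyx".to_vec());
    }
}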
2435
2436fn squeeze_multi_mmap<const N: usize>(
2437    chars: &[u8],
2438    data: &[u8],
2439    writer: &mut impl Write,
2440) -> io::Result<()> {
2441    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
2442    if data.len() >= PARALLEL_THRESHOLD {
2443        let member = build_member_set(chars);
2444        let n_threads = rayon::current_num_threads().max(1);
2445        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2446
2447        let results: Vec<Vec<u8>> = data
2448            .par_chunks(chunk_size)
2449            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2450            .collect();
2451
2452        // Build IoSlice list, fixing boundaries
2453        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2454        for (idx, result) in results.iter().enumerate() {
2455            if result.is_empty() {
2456                continue;
2457            }
2458            if idx > 0 {
2459                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2460                    if is_member(&member, prev_last) {
2461                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2462                        if skip < result.len() {
2463                            slices.push(std::io::IoSlice::new(&result[skip..]));
2464                        }
2465                        continue;
2466                    }
2467                }
2468            }
2469            slices.push(std::io::IoSlice::new(result));
2470        }
2471        return write_ioslices(writer, &slices);
2472    }
2473
2474    let buf_size = data.len().min(BUF_SIZE);
2475    let mut outbuf = vec![0u8; buf_size];
2476    let mut wp = 0;
2477    let mut last_squeezed: u16 = 256;
2478    let mut cursor = 0;
2479
2480    macro_rules! find_next {
2481        ($data:expr) => {
2482            if N == 2 {
2483                memchr::memchr2(chars[0], chars[1], $data)
2484            } else {
2485                memchr::memchr3(chars[0], chars[1], chars[2], $data)
2486            }
2487        };
2488    }
2489
2490    macro_rules! flush_and_copy {
2491        ($src:expr, $len:expr) => {
2492            if wp + $len > buf_size {
2493                writer.write_all(&outbuf[..wp])?;
2494                wp = 0;
2495            }
2496            if $len > buf_size {
2497                writer.write_all($src)?;
2498            } else {
2499                outbuf[wp..wp + $len].copy_from_slice($src);
2500                wp += $len;
2501            }
2502        };
2503    }
2504
2505    while cursor < data.len() {
2506        match find_next!(&data[cursor..]) {
2507            Some(offset) => {
2508                let pos = cursor + offset;
2509                let b = data[pos];
2510                if pos > cursor {
2511                    let span = pos - cursor;
2512                    flush_and_copy!(&data[cursor..pos], span);
2513                    last_squeezed = 256;
2514                }
2515                if last_squeezed != b as u16 {
2516                    if wp >= buf_size {
2517                        writer.write_all(&outbuf[..wp])?;
2518                        wp = 0;
2519                    }
2520                    outbuf[wp] = b;
2521                    wp += 1;
2522                    last_squeezed = b as u16;
2523                }
2524                let mut skip = pos + 1;
2525                while skip < data.len() && data[skip] == b {
2526                    skip += 1;
2527                }
2528                cursor = skip;
2529            }
2530            None => {
2531                let remaining = data.len() - cursor;
2532                flush_and_copy!(&data[cursor..], remaining);
2533                break;
2534            }
2535        }
2536    }
2537    if wp > 0 {
2538        writer.write_all(&outbuf[..wp])?;
2539    }
2540    Ok(())
2541}
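// Illustrative sketch (hypothetical test module, not from the original source): two
// squeeze characters route squeeze_mmap through squeeze_multi_mmap::<2> and its
// memchr2-driven scan above.
#[cfg(test)]
mod squeeze_multi_mmap_example {
    use super::*;

    #[test]
    fn squeezes_two_char_set() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b"ab", b"aabbab", &mut out).unwrap();
        assert_eq!(out, b"abab".to_vec());
    }
}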
2542
2543fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2544    if data.is_empty() {
2545        return Ok(());
2546    }
2547
2548    if memchr::memmem::find(data, &[ch, ch]).is_none() {
2549        return writer.write_all(data);
2550    }
2551
2552    // Parallel path: squeeze each chunk, fix boundaries
2553    if data.len() >= PARALLEL_THRESHOLD {
2554        let n_threads = rayon::current_num_threads().max(1);
2555        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2556
2557        let results: Vec<Vec<u8>> = data
2558            .par_chunks(chunk_size)
2559            .map(|chunk| {
2560                let mut out = Vec::with_capacity(chunk.len());
2561                let mut cursor = 0;
2562                while cursor < chunk.len() {
2563                    match memchr::memchr(ch, &chunk[cursor..]) {
2564                        Some(offset) => {
2565                            let pos = cursor + offset;
2566                            if pos > cursor {
2567                                out.extend_from_slice(&chunk[cursor..pos]);
2568                            }
2569                            out.push(ch);
2570                            cursor = pos + 1;
2571                            while cursor < chunk.len() && chunk[cursor] == ch {
2572                                cursor += 1;
2573                            }
2574                        }
2575                        None => {
2576                            out.extend_from_slice(&chunk[cursor..]);
2577                            break;
2578                        }
2579                    }
2580                }
2581                out
2582            })
2583            .collect();
2584
2585        // Build IoSlice list, dropping the leading `ch` of any chunk whose
2586        // predecessor already ended with `ch`; writev keeps syscalls to a minimum.
2587        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2588        for (idx, result) in results.iter().enumerate() {
2589            if result.is_empty() {
2590                continue;
2591            }
2592            if idx > 0 {
2593                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2594                    if prev_last == ch {
2595                        // Skip leading ch bytes in this chunk result
2596                        let skip = result.iter().take_while(|&&b| b == ch).count();
2597                        if skip < result.len() {
2598                            slices.push(std::io::IoSlice::new(&result[skip..]));
2599                        }
2600                        continue;
2601                    }
2602                }
2603            }
2604            slices.push(std::io::IoSlice::new(result));
2605        }
2606        return write_ioslices(writer, &slices);
2607    }
2608
2609    let buf_size = data.len().min(BUF_SIZE);
2610    let mut outbuf = vec![0u8; buf_size];
2611    let len = data.len();
2612    let mut wp = 0;
2613    let mut cursor = 0;
2614
2615    while cursor < len {
2616        match memchr::memchr(ch, &data[cursor..]) {
2617            Some(offset) => {
2618                let pos = cursor + offset;
2619                let gap = pos - cursor;
2620                if gap > 0 {
2621                    if wp + gap > buf_size {
2622                        writer.write_all(&outbuf[..wp])?;
2623                        wp = 0;
2624                    }
2625                    if gap > buf_size {
2626                        writer.write_all(&data[cursor..pos])?;
2627                    } else {
2628                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
2629                        wp += gap;
2630                    }
2631                }
2632                if wp >= buf_size {
2633                    writer.write_all(&outbuf[..wp])?;
2634                    wp = 0;
2635                }
2636                outbuf[wp] = ch;
2637                wp += 1;
2638                cursor = pos + 1;
2639                while cursor < len && data[cursor] == ch {
2640                    cursor += 1;
2641                }
2642            }
2643            None => {
2644                let remaining = len - cursor;
2645                if remaining > 0 {
2646                    if wp + remaining > buf_size {
2647                        writer.write_all(&outbuf[..wp])?;
2648                        wp = 0;
2649                    }
2650                    if remaining > buf_size {
2651                        writer.write_all(&data[cursor..])?;
2652                    } else {
2653                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
2654                        wp += remaining;
2655                    }
2656                }
2657                break;
2658            }
2659        }
2660    }
2661
2662    if wp > 0 {
2663        writer.write_all(&outbuf[..wp])?;
2664    }
2665    Ok(())
2666}
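// Illustrative sketch (hypothetical test module, not from the original source): a
// single squeeze character routes squeeze_mmap through squeeze_single_mmap; runs of
// '/' collapse, mirroring `tr -s '/'` on a path-like input.
#[cfg(test)]
mod squeeze_single_mmap_example {
    use super::*;

    #[test]
    fn collapses_slash_runs() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b"/", b"//usr///bin//", &mut out).unwrap();
        assert_eq!(out, b"/usr/bin/".to_vec());
    }
}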