coreutils_rs/tr/core.rs

use std::io::{self, Read, Write};

use rayon::prelude::*;

/// Maximum IoSlice entries per write_vectored batch.
/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
const MAX_IOV: usize = 1024;

/// Main processing buffer: 4MB (fits in L3 cache, avoids cache thrashing).
const BUF_SIZE: usize = 4 * 1024 * 1024;

/// Stream buffer: 1MB — fits in L2/L3 cache so data stays hot between read,
/// process, and write phases. Larger buffers (4-8MB) cause cache thrashing.
const STREAM_BUF: usize = 1024 * 1024;

/// Minimum data size to engage rayon parallel processing for mmap paths.
/// Below this, single-threaded is faster due to thread pool overhead.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
/// Falls back to write_all per slice for partial writes.
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                // Partial write: fall back to write_all per remaining slice
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}

/// Allocate a Vec<u8> of given length without zero-initialization.
/// SAFETY: Caller must write all bytes before reading them.
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
    unsafe {
        v.set_len(len);
    }
    v
}

/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}
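
// Illustrative example (editorial note, not part of the original file): with
// set1 = b"abc" and set2 = b"xy", the table above maps
//   table[b'a' as usize] == b'x', table[b'b' as usize] == b'y',
//   table[b'c' as usize] == b'y'   // set2 is padded by repeating its last byte
// and every other entry stays the identity mapping.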

/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}
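
// Illustrative example (editorial note): b'A' is 0x41, so build_member_set(b"A")
// sets bit 1 of byte 8 (0x41 >> 3 == 8, 0x41 & 7 == 1); is_member tests that bit.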

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}

/// Translate bytes in-place using a 256-byte lookup table.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    for b in data.iter_mut() {
        *b = unsafe { *table.get_unchecked(*b as usize) };
    }
}

/// Translate bytes from source to destination using a 256-byte lookup table.
#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// SIMD range translation (x86_64)
// ============================================================================

/// Detect if the translate table is a single contiguous range with constant offset.
/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
/// table[i] = i + offset for all i in [lo, hi].
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}
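
// Illustrative example (editorial note): the table for `tr a-z A-Z` maps
// 0x61..=0x7A to 0x41..=0x5A, so detect_range_offset returns Some((0x61, 0x7A, -32)).
// A table with two disjoint ranges, or one range with mixed offsets, returns None
// and falls back to the scalar lookup path.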

/// SIMD-accelerated range translation for mmap'd data.
/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Bias: shift range so lo maps to -128 (signed min).
        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
        // For input < lo: biased wraps to large positive (signed), > threshold.
        // For input > hi: biased > -128+range, > threshold.
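        // Worked example (editorial note, a-z -> A-Z): lo = 0x61, range = 25,
        // bias = 0x80 - 0x61 = 0x1F, threshold = 0x80 + 25 = 0x99 (-103 as i8).
        //   input 'm' (0x6D): biased = 0x8C = -116 as i8, not > -103 -> in range
        //   input '0' (0x30): biased = 0x4F = +79 as i8, > -103 -> left unchanged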
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt = 0xFF where biased > threshold (OUT of range)
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // mask = 0xFF where IN range (NOT gt)
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        // SSE2 tail for 16-byte remainder
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

/// Scalar range translation fallback for non-x86_64.
#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (i, &b) in src.iter().enumerate() {
        dst[i] = if b >= lo && b <= hi {
            b.wrapping_add(offset as u8)
        } else {
            b
        };
    }
}

// ============================================================================
// In-place SIMD range translation (saves one buffer allocation in streaming)
// ============================================================================

/// In-place SIMD-accelerated range translation.
/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = b.wrapping_add(offset as u8);
        }
    }
}

// ============================================================================
// SIMD range deletion (x86_64)
// ============================================================================

/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
/// Returns Some((lo, hi)) if so. This is true for common classes:
/// - `[:digit:]` = 0x30..=0x39
/// - `a-z` = 0x61..=0x7A
/// - `A-Z` = 0x41..=0x5A
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in &chars[1..] {
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    // Check that the range size matches the number of chars (no gaps;
    // this assumes `chars` contains no duplicate bytes)
    if (hi - lo + 1) as usize == chars.len() {
        Some((lo, hi))
    } else {
        None
    }
}
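
// Illustrative example (editorial note): `tr -d '[:digit:]'` expands to the
// bytes 0x30..=0x39, so detect_delete_range returns Some((0x30, 0x39));
// a gapped set such as b"ac" returns None and uses the bitset path instead.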

/// SIMD-accelerated delete for contiguous byte ranges.
/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
/// then compacts output by skipping matched bytes.
#[cfg(target_arch = "x86_64")]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    if is_x86_feature_detected!("avx2") {
        unsafe { delete_range_avx2(src, dst, lo, hi) }
    } else {
        unsafe { delete_range_sse2(src, dst, lo, hi) }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            // keep_mask bits: 1 = keep (NOT in range)
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            // Compact: copy kept bytes using the mask
            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1; // clear lowest set bit
            }
            ri += 32;
        }

        // SSE2 tail for 16-byte remainder
        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1;
            }
            ri += 16;
        }

        // Scalar tail
        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

/// Scalar range delete fallback for non-x86_64.
#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let mut wp = 0;
    for &b in src {
        if b < lo || b > hi {
            dst[wp] = b;
            wp += 1;
        }
    }
    wp
}

/// Streaming delete for contiguous byte ranges using SIMD range detection.
fn delete_range_streaming(
    lo: u8,
    hi: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
        if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

// ============================================================================
// Streaming functions (Read + Write)
// ============================================================================

pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    // Try SIMD fast path for range translations (in-place, single buffer)
    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_range_stream(lo, hi, offset, reader, writer);
    }

    // General case: use separate src/dst buffers with 8x-unrolled translate_to.
    // This avoids the read-modify-write cache penalty of in-place translation:
    // reading and writing the same cache line forces store-to-load forwarding stalls.
    // With separate buffers, the CPU can pipeline reads from src while writing to dst.
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        translate_to(&src[..n], &mut dst[..n], &table);
        writer.write_all(&dst[..n])?;
    }
    Ok(())
}

/// Streaming SIMD range translation — single buffer, in-place transform.
/// Saves a second STREAM_BUF allocation and a copy vs separate src/dst buffers.
fn translate_range_stream(
    lo: u8,
    hi: u8,
    offset: i8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

/// Read as many bytes as possible into buf, retrying on partial reads.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}

pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // Two-pass optimization for range translations:
    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
    // Even though it's two passes, the translate pass is so much faster with SIMD
    // that the total is still a net win.
    let range_info = detect_range_offset(&table);

    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        // Pass 1: translate
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        } else {
            translate_inplace(&mut buf[..n], &table);
        }
        // Pass 2: squeeze in-place
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
    // use vectorized range comparison instead of scalar bitset lookup.
    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_streaming(lo, hi, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                if !is_member(&member, b0) {
                    *ptr.add(wp) = b0;
                    wp += 1;
                }
                if !is_member(&member, b1) {
                    *ptr.add(wp) = b1;
                    wp += 1;
                }
                if !is_member(&member, b2) {
                    *ptr.add(wp) = b2;
                    wp += 1;
                }
                if !is_member(&member, b3) {
                    *ptr.add(wp) = b3;
                    wp += 1;
                }
                if !is_member(&member, b4) {
                    *ptr.add(wp) = b4;
                    wp += 1;
                }
                if !is_member(&member, b5) {
                    *ptr.add(wp) = b5;
                    wp += 1;
                }
                if !is_member(&member, b6) {
                    *ptr.add(wp) = b6;
                    wp += 1;
                }
                if !is_member(&member, b7) {
                    *ptr.add(wp) = b7;
                    wp += 1;
                }
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                if !is_member(&member, b) {
                    *ptr.add(wp) = b;
                    wp += 1;
                }
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &buf[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
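
// Illustrative example (editorial note): squeezing with squeeze_chars = b"b"
// turns "aabbbcc" into "aabcc"; runs of non-member bytes pass through untouched.
// `last_squeezed` is a u16 so the sentinel value 256 (impossible for a u8) can
// mean "no pending squeeze byte", and the state carries across buffer boundaries.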

fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };

        let mut wp = 0;
        let mut i = 0;

        while i < n {
            if was_squeeze_char && buf[i] == ch {
                i += 1;
                while i < n && buf[i] == ch {
                    i += 1;
                }
                if i >= n {
                    break;
                }
            }

            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    let run_len = offset;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    i += run_len;

                    unsafe {
                        *buf.as_mut_ptr().add(wp) = ch;
                    }
                    wp += 1;
                    was_squeeze_char = true;
                    i += 1;
                    while i < n && buf[i] == ch {
                        i += 1;
                    }
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    was_squeeze_char = false;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

// ============================================================================
// Mmap-based functions (zero-copy input from byte slice)
// ============================================================================

/// Maximum data size for single-allocation translate approach.
/// Below this limit, translate ALL data into one buffer and do a single write_all.
/// Above this, use chunked approach to limit memory usage.
const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;

/// Translate bytes from an mmap'd byte slice.
/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
/// Falls back to scalar 256-byte table lookup for general translations.
///
/// For data >= 2MB: uses rayon parallel processing across multiple cores.
/// For data <= 16MB: single allocation + single write_all (1 syscall).
/// For data > 16MB: chunked approach to limit memory (N syscalls where N = data/4MB).
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    // Check if table is identity — pure passthrough
    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    // Try SIMD fast path for single-range constant-offset translations
    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_mmap_range(data, writer, lo, hi, offset);
    }

    // General case: table lookup (with parallel processing for large data)
    translate_mmap_table(data, writer, &table)
}

/// SIMD range translate for mmap data, with rayon parallel processing.
fn translate_mmap_range(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    offset: i8,
) -> io::Result<()> {
    // Parallel path: split data into chunks, translate each in parallel
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Process chunks in parallel: each thread writes to its slice of buf
        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
            });

        return writer.write_all(&buf);
    }

    // Small data: single-threaded SIMD
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_range_simd(data, &mut buf, lo, hi, offset);
        return writer.write_all(&buf);
    }
    // Chunked path for large data (shouldn't happen since PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT)
    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

/// General table-lookup translate for mmap data, with rayon parallel processing.
fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
    // Parallel path: split data into chunks, translate each in parallel
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
            });

        return writer.write_all(&buf);
    }

    // Small data: single-threaded
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_to(data, &mut buf, table);
        return writer.write_all(&buf);
    }
    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_to(chunk, &mut buf[..chunk.len()], table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

/// Translate + squeeze from mmap'd byte slice.
///
/// For data >= 2MB: two-phase approach: parallel translate, then sequential squeeze.
/// For data <= 16MB: single-pass translate+squeeze into one buffer, one write syscall.
/// For data > 16MB: chunked approach to limit memory.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // For large data: two-phase approach
    // Phase 1: parallel translate into buffer
    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
    //          (squeeze only removes bytes, never grows, so no second allocation needed)
    if data.len() >= PARALLEL_THRESHOLD {
        // Phase 1: parallel translate
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        // Phase 2: squeeze in-place on the translated buffer.
        // Since squeeze only removes bytes (never grows), we can read ahead and
        // compact into the same buffer, saving a full data.len() heap allocation.
        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    // Single-write fast path: translate+squeeze all data in one pass, one write
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            buf.set_len(data.len());
            let outp: *mut u8 = buf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut wp = 0;
            let mut i = 0;
            while i < len {
                let translated = *table.get_unchecked(*inp.add(i) as usize);
                if is_member(&squeeze_set, translated) {
                    if last_squeezed == translated as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = translated as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(wp) = translated;
                wp += 1;
                i += 1;
            }
            buf.set_len(wp);
        }
        return writer.write_all(&buf);
    }

    // Chunked path for large data
    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Delete from mmap'd byte slice.
///
/// For data >= 2MB: uses rayon parallel processing across multiple cores.
/// For data <= 16MB: delete into one buffer, one write syscall.
/// For data > 16MB: chunked approach to limit memory.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_mmap(data, writer, lo, hi);
    }

    let member = build_member_set(delete_chars);

    // Parallel path: pre-allocate a single output buffer of data.len() and have each
    // thread write to its non-overlapping slice, then do a single write_all.
    // This avoids per-chunk Vec allocations that the old approach had.
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Each thread deletes into its slice of outbuf and returns bytes written.
        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        // Compact: move each chunk's output to be contiguous.
        // chunk_lens[i] is how many bytes thread i wrote into its slice.
        // We need to shift them together since each dst_chunk started at chunk_size offsets.
        let mut write_pos = 0;
        let mut src_offset = 0;
        for &clen in &chunk_lens {
            if clen > 0 && src_offset != write_pos {
                unsafe {
                    std::ptr::copy(
                        outbuf.as_ptr().add(src_offset),
                        outbuf.as_mut_ptr().add(write_pos),
                        clen,
                    );
                }
            }
            write_pos += clen;
            src_offset += chunk_size;
        }

        return writer.write_all(&outbuf[..write_pos]);
    }

    // Single-write fast path: delete into one buffer, one write
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
        return writer.write_all(&outbuf[..out_pos]);
    }

    // Chunked path for large data
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = alloc_uninit_vec(buf_size);

    for chunk in data.chunks(buf_size) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// SIMD range delete for mmap data, with rayon parallel processing.
fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
    // Parallel path: each thread deletes from its chunk into a local Vec
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = alloc_uninit_vec(chunk.len());
                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
                unsafe { out.set_len(wp) };
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    // Single-write fast path
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
        return writer.write_all(&outbuf[..wp]);
    }

    // Chunked path
    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
/// Returns number of bytes written.
#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}
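
// Note (editorial): the loop above is branchless compaction — each byte is
// stored unconditionally at out_pos, and out_pos only advances when the byte
// is kept, so deleted bytes are simply overwritten by the next store.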
1508
1509fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1510    // Parallel path for large data: each thread deletes from its chunk,
1511    // then use writev to write all results in one syscall batch.
1512    if data.len() >= PARALLEL_THRESHOLD {
1513        let n_threads = rayon::current_num_threads().max(1);
1514        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1515
1516        let results: Vec<Vec<u8>> = data
1517            .par_chunks(chunk_size)
1518            .map(|chunk| {
1519                let mut out = Vec::with_capacity(chunk.len());
1520                let mut last = 0;
1521                for pos in memchr::memchr_iter(ch, chunk) {
1522                    if pos > last {
1523                        out.extend_from_slice(&chunk[last..pos]);
1524                    }
1525                    last = pos + 1;
1526                }
1527                if last < chunk.len() {
1528                    out.extend_from_slice(&chunk[last..]);
1529                }
1530                out
1531            })
1532            .collect();
1533
1534        // Use writev to batch all results into fewer syscalls
1535        let slices: Vec<std::io::IoSlice> = results
1536            .iter()
1537            .filter(|r| !r.is_empty())
1538            .map(|r| std::io::IoSlice::new(r))
1539            .collect();
1540        return write_ioslices(writer, &slices);
1541    }
1542
1543    // Single-write fast path: collect all non-deleted spans into one buffer
1544    if data.len() <= SINGLE_WRITE_LIMIT {
1545        let mut outbuf = Vec::with_capacity(data.len());
1546        let mut last = 0;
1547        for pos in memchr::memchr_iter(ch, data) {
1548            if pos > last {
1549                outbuf.extend_from_slice(&data[last..pos]);
1550            }
1551            last = pos + 1;
1552        }
1553        if last < data.len() {
1554            outbuf.extend_from_slice(&data[last..]);
1555        }
1556        return writer.write_all(&outbuf);
1557    }
1558
1559    // Chunked path for large data
1560    let buf_size = data.len().min(BUF_SIZE);
1561    let mut outbuf = vec![0u8; buf_size];
1562
1563    for chunk in data.chunks(buf_size) {
1564        let mut wp = 0;
1565        let mut last = 0;
1566        for pos in memchr::memchr_iter(ch, chunk) {
1567            if pos > last {
1568                let run = pos - last;
1569                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
1570                wp += run;
1571            }
1572            last = pos + 1;
1573        }
1574        if last < chunk.len() {
1575            let run = chunk.len() - last;
1576            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
1577            wp += run;
1578        }
1579        writer.write_all(&outbuf[..wp])?;
1580    }
1581    Ok(())
1582}
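
// A small usage sketch (added for illustration): over an in-memory slice this
// behaves like `tr -d 'CHAR'`. Any `impl Write` works as the sink; a Vec<u8>
// makes the result easy to inspect.
#[cfg(test)]
mod delete_single_char_sketch {
    use super::*;

    #[test]
    fn strips_every_occurrence_of_the_byte() {
        let mut out: Vec<u8> = Vec::new();
        delete_single_char_mmap(b'a', b"banana bread", &mut out).unwrap();
        assert_eq!(out, b"bnn bred");
    }
}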
1583
/// Delete a 2- or 3-byte set from an mmap'd slice using memchr2/memchr3 scanning.
1584fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1585    let c0 = chars[0];
1586    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
1587    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
1588    let is_three = chars.len() >= 3;
1589
1590    // Parallel path for large data
1591    if data.len() >= PARALLEL_THRESHOLD {
1592        let n_threads = rayon::current_num_threads().max(1);
1593        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1594
1595        let results: Vec<Vec<u8>> = data
1596            .par_chunks(chunk_size)
1597            .map(|chunk| {
1598                let mut out = Vec::with_capacity(chunk.len());
1599                let mut last = 0;
1600                if is_three {
1601                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
1602                        if pos > last {
1603                            out.extend_from_slice(&chunk[last..pos]);
1604                        }
1605                        last = pos + 1;
1606                    }
1607                } else {
1608                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
1609                        if pos > last {
1610                            out.extend_from_slice(&chunk[last..pos]);
1611                        }
1612                        last = pos + 1;
1613                    }
1614                }
1615                if last < chunk.len() {
1616                    out.extend_from_slice(&chunk[last..]);
1617                }
1618                out
1619            })
1620            .collect();
1621
1622        // Use writev to batch all results into fewer syscalls
1623        let slices: Vec<std::io::IoSlice> = results
1624            .iter()
1625            .filter(|r| !r.is_empty())
1626            .map(|r| std::io::IoSlice::new(r))
1627            .collect();
1628        return write_ioslices(writer, &slices);
1629    }
1630
1631    // Single-write fast path: collect all non-deleted spans into one buffer
1632    if data.len() <= SINGLE_WRITE_LIMIT {
1633        let mut outbuf = Vec::with_capacity(data.len());
1634        let mut last = 0;
1635        if is_three {
1636            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
1637                if pos > last {
1638                    outbuf.extend_from_slice(&data[last..pos]);
1639                }
1640                last = pos + 1;
1641            }
1642        } else {
1643            for pos in memchr::memchr2_iter(c0, c1, data) {
1644                if pos > last {
1645                    outbuf.extend_from_slice(&data[last..pos]);
1646                }
1647                last = pos + 1;
1648            }
1649        }
1650        if last < data.len() {
1651            outbuf.extend_from_slice(&data[last..]);
1652        }
1653        return writer.write_all(&outbuf);
1654    }
1655
1656    // Chunked path for large data
1657    let buf_size = data.len().min(BUF_SIZE);
1658    let mut outbuf = vec![0u8; buf_size];
1659
1660    for chunk in data.chunks(buf_size) {
1661        let mut wp = 0;
1662        let mut last = 0;
1663
1664        // Iterate directly over memchr iterator without collecting into Vec<usize>.
1665        // Positions are used exactly once in order, so no intermediate allocation needed.
1666        if is_three {
1667            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
1668                if pos > last {
1669                    let run = pos - last;
1670                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
1671                    wp += run;
1672                }
1673                last = pos + 1;
1674            }
1675        } else {
1676            for pos in memchr::memchr2_iter(c0, c1, chunk) {
1677                if pos > last {
1678                    let run = pos - last;
1679                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
1680                    wp += run;
1681                }
1682                last = pos + 1;
1683            }
1684        }
1685
1686        if last < chunk.len() {
1687            let run = chunk.len() - last;
1688            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
1689            wp += run;
1690        }
1691        writer.write_all(&outbuf[..wp])?;
1692    }
1693    Ok(())
1694}
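
// Sketch (added): the memchr2/memchr3 path above serves 2- and 3-byte delete
// sets, e.g. `tr -d '\r\n'`. Only the first three bytes of `chars` are
// consulted, so callers are expected to pass sets of exactly that size.
#[cfg(test)]
mod delete_multi_memchr_sketch {
    use super::*;

    #[test]
    fn removes_both_line_ending_bytes() {
        let mut out: Vec<u8> = Vec::new();
        delete_multi_memchr_mmap(b"\r\n", b"one\r\ntwo\r\nthree", &mut out).unwrap();
        assert_eq!(out, b"onetwothree");
    }
}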
1695
1696/// Delete + squeeze from mmap'd byte slice.
1697///
1698/// For data <= 16MB: delete+squeeze into one buffer, one write syscall.
1699/// For data > 16MB: chunked approach to limit memory.
1700pub fn delete_squeeze_mmap(
1701    delete_chars: &[u8],
1702    squeeze_chars: &[u8],
1703    data: &[u8],
1704    writer: &mut impl Write,
1705) -> io::Result<()> {
1706    let delete_set = build_member_set(delete_chars);
1707    let squeeze_set = build_member_set(squeeze_chars);
1708
1709    // Single-write fast path: delete+squeeze all data in one pass, one write
1710    if data.len() <= SINGLE_WRITE_LIMIT {
1711        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
1712        let mut last_squeezed: u16 = 256;
1713        unsafe {
1714            outbuf.set_len(data.len());
1715            let outp: *mut u8 = outbuf.as_mut_ptr();
1716            let inp = data.as_ptr();
1717            let len = data.len();
1718            let mut out_pos = 0;
1719            let mut i = 0;
1720            while i < len {
1721                let b = *inp.add(i);
1722                if is_member(&delete_set, b) {
1723                    i += 1;
1724                    continue;
1725                }
1726                if is_member(&squeeze_set, b) {
1727                    if last_squeezed == b as u16 {
1728                        i += 1;
1729                        continue;
1730                    }
1731                    last_squeezed = b as u16;
1732                } else {
1733                    last_squeezed = 256;
1734                }
1735                *outp.add(out_pos) = b;
1736                out_pos += 1;
1737                i += 1;
1738            }
1739            outbuf.set_len(out_pos);
1740        }
1741        return writer.write_all(&outbuf);
1742    }
1743
1744    // Chunked path for large data
1745    let buf_size = data.len().min(BUF_SIZE);
1746    let mut outbuf = vec![0u8; buf_size];
1747    let mut last_squeezed: u16 = 256;
1748
1749    for chunk in data.chunks(buf_size) {
1750        let mut out_pos = 0;
1751        for &b in chunk {
1752            if is_member(&delete_set, b) {
1753                continue;
1754            }
1755            if is_member(&squeeze_set, b) {
1756                if last_squeezed == b as u16 {
1757                    continue;
1758                }
1759                last_squeezed = b as u16;
1760            } else {
1761                last_squeezed = 256;
1762            }
1763            unsafe {
1764                *outbuf.get_unchecked_mut(out_pos) = b;
1765            }
1766            out_pos += 1;
1767        }
1768        writer.write_all(&outbuf[..out_pos])?;
1769    }
1770    Ok(())
1771}
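
// Illustrative sketch (not original code): `delete_squeeze_mmap` applies
// `tr -d SET1 -s SET2` semantics in a single pass. Deleted bytes do not reset
// the squeeze state, so a run that only becomes adjacent after deletion still
// collapses to one byte.
#[cfg(test)]
mod delete_squeeze_sketch {
    use super::*;

    #[test]
    fn deletes_then_squeezes_the_survivors() {
        let mut out: Vec<u8> = Vec::new();
        // Delete '-', then squeeze repeated 'a' and ' ' in what remains.
        delete_squeeze_mmap(b"-", b"a ", b"a-a  b", &mut out).unwrap();
        assert_eq!(out, b"a b");
    }
}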
1772
1773/// Squeeze from mmap'd byte slice.
1774///
1775/// Sets of 1-3 bytes dispatch to memchr-based specializations. Otherwise:
1776/// data >= 2MB uses rayon parallel processing with boundary fixup; data <= 16MB
1777/// is squeezed into one buffer and one write syscall; larger data is chunked to limit memory.
1778pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1779    if squeeze_chars.len() == 1 {
1780        return squeeze_single_mmap(squeeze_chars[0], data, writer);
1781    }
1782    if squeeze_chars.len() == 2 {
1783        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
1784    }
1785    if squeeze_chars.len() == 3 {
1786        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
1787    }
1788
1789    let member = build_member_set(squeeze_chars);
1790
1791    // Parallel path: squeeze each chunk independently, then fix boundaries
1792    if data.len() >= PARALLEL_THRESHOLD {
1793        let n_threads = rayon::current_num_threads().max(1);
1794        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1795
1796        let results: Vec<Vec<u8>> = data
1797            .par_chunks(chunk_size)
1798            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
1799            .collect();
1800
1801        // Build IoSlice list, fixing boundaries: if the nearest preceding non-empty
1802        // chunk ends with byte B, B is in the squeeze set, and this chunk starts
1803        // with B, skip this chunk's leading run of B.
1804        // Collect slices for writev to minimize syscalls.
1805        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
1806        for (idx, result) in results.iter().enumerate() {
1807            if result.is_empty() {
1808                continue;
1809            }
1810            if idx > 0 {
1811                // Check boundary: does previous chunk end with same squeezable byte?
1812                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
1813                    if is_member(&member, prev_last) {
1814                        // Skip leading bytes in this chunk that equal prev_last
1815                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
1816                        if skip < result.len() {
1817                            slices.push(std::io::IoSlice::new(&result[skip..]));
1818                        }
1819                        continue;
1820                    }
1821                }
1822            }
1823            slices.push(std::io::IoSlice::new(result));
1824        }
1825        return write_ioslices(writer, &slices);
1826    }
1827
1828    // Single-write fast path: squeeze all data into one buffer, one write
1829    if data.len() <= SINGLE_WRITE_LIMIT {
1830        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
1831        let mut last_squeezed: u16 = 256;
1832        let len = data.len();
1833        let mut wp = 0;
1834        let mut i = 0;
1835
1836        unsafe {
1837            outbuf.set_len(data.len());
1838            let inp = data.as_ptr();
1839            let outp: *mut u8 = outbuf.as_mut_ptr();
1840
1841            while i < len {
1842                let b = *inp.add(i);
1843                if is_member(&member, b) {
1844                    if last_squeezed != b as u16 {
1845                        *outp.add(wp) = b;
1846                        wp += 1;
1847                        last_squeezed = b as u16;
1848                    }
1849                    i += 1;
1850                    while i < len && *inp.add(i) == b {
1851                        i += 1;
1852                    }
1853                } else {
1854                    last_squeezed = 256;
1855                    *outp.add(wp) = b;
1856                    wp += 1;
1857                    i += 1;
1858                }
1859            }
1860            outbuf.set_len(wp);
1861        }
1862        return writer.write_all(&outbuf);
1863    }
1864
1865    // Chunked path for large data
1866    let buf_size = data.len().min(BUF_SIZE);
1867    let mut outbuf = vec![0u8; buf_size];
1868    let mut last_squeezed: u16 = 256;
1869
1870    for chunk in data.chunks(buf_size) {
1871        let len = chunk.len();
1872        let mut wp = 0;
1873        let mut i = 0;
1874
1875        unsafe {
1876            let inp = chunk.as_ptr();
1877            let outp = outbuf.as_mut_ptr();
1878
1879            while i < len {
1880                let b = *inp.add(i);
1881                if is_member(&member, b) {
1882                    if last_squeezed != b as u16 {
1883                        *outp.add(wp) = b;
1884                        wp += 1;
1885                        last_squeezed = b as u16;
1886                    }
1887                    i += 1;
1888                    while i < len && *inp.add(i) == b {
1889                        i += 1;
1890                    }
1891                } else {
1892                    last_squeezed = 256;
1893                    *outp.add(wp) = b;
1894                    wp += 1;
1895                    i += 1;
1896                }
1897            }
1898        }
1899        writer.write_all(&outbuf[..wp])?;
1900    }
1901    Ok(())
1902}
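
// Usage sketch (added): with four or more squeeze bytes the generic bitset path
// above is taken; 1-3 byte sets go to the memchr specializations. Either way,
// only runs of bytes in the set collapse; other repeats are left alone.
#[cfg(test)]
mod squeeze_mmap_sketch {
    use super::*;

    #[test]
    fn collapses_runs_of_member_bytes_only() {
        let mut out: Vec<u8> = Vec::new();
        // Four distinct squeeze bytes force the general bitset path.
        squeeze_mmap(b"ac. ", b"aaab...ccc  dd", &mut out).unwrap();
        // 'd' is not in the set, so "dd" survives untouched.
        assert_eq!(out, b"ab.c dd");
    }
}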
1903
1904/// Squeeze a single chunk using bitset membership. Returns squeezed output.
1905fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
1906    let len = chunk.len();
1907    let mut out = Vec::with_capacity(len);
1908    let mut last_squeezed: u16 = 256;
1909    let mut i = 0;
1910
1911    unsafe {
1912        out.set_len(len);
1913        let inp = chunk.as_ptr();
1914        let outp: *mut u8 = out.as_mut_ptr();
1915        let mut wp = 0;
1916
1917        while i < len {
1918            let b = *inp.add(i);
1919            if is_member(member, b) {
1920                if last_squeezed != b as u16 {
1921                    *outp.add(wp) = b;
1922                    wp += 1;
1923                    last_squeezed = b as u16;
1924                }
1925                i += 1;
1926                while i < len && *inp.add(i) == b {
1927                    i += 1;
1928                }
1929            } else {
1930                last_squeezed = 256;
1931                *outp.add(wp) = b;
1932                wp += 1;
1933                i += 1;
1934            }
1935        }
1936        out.set_len(wp);
1937    }
1938    out
1939}
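
// Sketch of the boundary fixup used by the parallel paths (added for
// illustration): chunks are squeezed independently, so a run straddling a
// chunk boundary leaves one byte at the end of chunk N and one at the start of
// chunk N+1; the IoSlice builder drops the leading duplicate before writev.
#[cfg(test)]
mod squeeze_boundary_fixup_sketch {
    use super::*;

    #[test]
    fn seam_duplicate_is_dropped() {
        let member = build_member_set(b"x");
        // One run of 'x' split across two chunks.
        let left = squeeze_chunk_bitset(b"abxx", &member);
        let right = squeeze_chunk_bitset(b"xxcd", &member);
        assert_eq!(left, b"abx");
        assert_eq!(right, b"xcd");

        // The same fixup the parallel writers apply before writev.
        let prev_last = *left.last().unwrap();
        let skip = if is_member(&member, prev_last) {
            right.iter().take_while(|&&b| b == prev_last).count()
        } else {
            0
        };
        let mut joined = left.clone();
        joined.extend_from_slice(&right[skip..]);
        assert_eq!(joined, b"abxcd");
    }
}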
1940
/// Squeeze a 2- or 3-byte set (N is the set length) using memchr2/memchr3 to find runs.
1941fn squeeze_multi_mmap<const N: usize>(
1942    chars: &[u8],
1943    data: &[u8],
1944    writer: &mut impl Write,
1945) -> io::Result<()> {
1946    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
1947    if data.len() >= PARALLEL_THRESHOLD {
1948        let member = build_member_set(chars);
1949        let n_threads = rayon::current_num_threads().max(1);
1950        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1951
1952        let results: Vec<Vec<u8>> = data
1953            .par_chunks(chunk_size)
1954            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
1955            .collect();
1956
1957        // Build IoSlice list, fixing boundaries
1958        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
1959        for (idx, result) in results.iter().enumerate() {
1960            if result.is_empty() {
1961                continue;
1962            }
1963            if idx > 0 {
1964                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
1965                    if is_member(&member, prev_last) {
1966                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
1967                        if skip < result.len() {
1968                            slices.push(std::io::IoSlice::new(&result[skip..]));
1969                        }
1970                        continue;
1971                    }
1972                }
1973            }
1974            slices.push(std::io::IoSlice::new(result));
1975        }
1976        return write_ioslices(writer, &slices);
1977    }
1978
1979    let buf_size = data.len().min(BUF_SIZE);
1980    let mut outbuf = vec![0u8; buf_size];
1981    let mut wp = 0;
1982    let mut last_squeezed: u16 = 256;
1983    let mut cursor = 0;
1984
1985    macro_rules! find_next {
1986        ($data:expr) => {
1987            if N == 2 {
1988                memchr::memchr2(chars[0], chars[1], $data)
1989            } else {
1990                memchr::memchr3(chars[0], chars[1], chars[2], $data)
1991            }
1992        };
1993    }
1994
1995    macro_rules! flush_and_copy {
1996        ($src:expr, $len:expr) => {
1997            if wp + $len > buf_size {
1998                writer.write_all(&outbuf[..wp])?;
1999                wp = 0;
2000            }
2001            if $len > buf_size {
2002                writer.write_all($src)?;
2003            } else {
2004                outbuf[wp..wp + $len].copy_from_slice($src);
2005                wp += $len;
2006            }
2007        };
2008    }
2009
2010    while cursor < data.len() {
2011        match find_next!(&data[cursor..]) {
2012            Some(offset) => {
2013                let pos = cursor + offset;
2014                let b = data[pos];
2015                if pos > cursor {
2016                    let span = pos - cursor;
2017                    flush_and_copy!(&data[cursor..pos], span);
2018                    last_squeezed = 256;
2019                }
2020                if last_squeezed != b as u16 {
2021                    if wp >= buf_size {
2022                        writer.write_all(&outbuf[..wp])?;
2023                        wp = 0;
2024                    }
2025                    outbuf[wp] = b;
2026                    wp += 1;
2027                    last_squeezed = b as u16;
2028                }
2029                let mut skip = pos + 1;
2030                while skip < data.len() && data[skip] == b {
2031                    skip += 1;
2032                }
2033                cursor = skip;
2034            }
2035            None => {
2036                let remaining = data.len() - cursor;
2037                flush_and_copy!(&data[cursor..], remaining);
2038                break;
2039            }
2040        }
2041    }
2042    if wp > 0 {
2043        writer.write_all(&outbuf[..wp])?;
2044    }
2045    Ok(())
2046}
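
// Brief sketch (added): 2- and 3-byte squeeze sets reach this function through
// `squeeze_mmap`, which selects N from the set length; the observable behaviour
// matches `tr -s` over those bytes.
#[cfg(test)]
mod squeeze_multi_sketch {
    use super::*;

    #[test]
    fn two_byte_set_takes_the_memchr2_path() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b" \t", b"  spaced\t\t\tout  ", &mut out).unwrap();
        assert_eq!(out, b" spaced\tout ");
    }
}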
2047
/// Squeeze runs of a single byte (`tr -s 'c'`); input with no doubled byte is written through unchanged.
2048fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2049    if data.is_empty() {
2050        return Ok(());
2051    }
2052
2053    if memchr::memmem::find(data, &[ch, ch]).is_none() {
2054        return writer.write_all(data);
2055    }
2056
2057    // Parallel path: squeeze each chunk, fix boundaries
2058    if data.len() >= PARALLEL_THRESHOLD {
2059        let n_threads = rayon::current_num_threads().max(1);
2060        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2061
2062        let results: Vec<Vec<u8>> = data
2063            .par_chunks(chunk_size)
2064            .map(|chunk| {
2065                let mut out = Vec::with_capacity(chunk.len());
2066                let mut cursor = 0;
2067                while cursor < chunk.len() {
2068                    match memchr::memchr(ch, &chunk[cursor..]) {
2069                        Some(offset) => {
2070                            let pos = cursor + offset;
2071                            if pos > cursor {
2072                                out.extend_from_slice(&chunk[cursor..pos]);
2073                            }
2074                            out.push(ch);
2075                            cursor = pos + 1;
2076                            while cursor < chunk.len() && chunk[cursor] == ch {
2077                                cursor += 1;
2078                            }
2079                        }
2080                        None => {
2081                            out.extend_from_slice(&chunk[cursor..]);
2082                            break;
2083                        }
2084                    }
2085                }
2086                out
2087            })
2088            .collect();
2089
2090        // Build IoSlice list, fixing boundary squeezability.
2091        // Use writev to minimize syscalls.
2092        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2093        for (idx, result) in results.iter().enumerate() {
2094            if result.is_empty() {
2095                continue;
2096            }
2097            if idx > 0 {
2098                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2099                    if prev_last == ch {
2100                        // Skip leading ch bytes in this chunk result
2101                        let skip = result.iter().take_while(|&&b| b == ch).count();
2102                        if skip < result.len() {
2103                            slices.push(std::io::IoSlice::new(&result[skip..]));
2104                        }
2105                        continue;
2106                    }
2107                }
2108            }
2109            slices.push(std::io::IoSlice::new(result));
2110        }
2111        return write_ioslices(writer, &slices);
2112    }
2113
2114    let buf_size = data.len().min(BUF_SIZE);
2115    let mut outbuf = vec![0u8; buf_size];
2116    let len = data.len();
2117    let mut wp = 0;
2118    let mut cursor = 0;
2119
2120    while cursor < len {
2121        match memchr::memchr(ch, &data[cursor..]) {
2122            Some(offset) => {
2123                let pos = cursor + offset;
2124                let gap = pos - cursor;
2125                if gap > 0 {
2126                    if wp + gap > buf_size {
2127                        writer.write_all(&outbuf[..wp])?;
2128                        wp = 0;
2129                    }
2130                    if gap > buf_size {
2131                        writer.write_all(&data[cursor..pos])?;
2132                    } else {
2133                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
2134                        wp += gap;
2135                    }
2136                }
2137                if wp >= buf_size {
2138                    writer.write_all(&outbuf[..wp])?;
2139                    wp = 0;
2140                }
2141                outbuf[wp] = ch;
2142                wp += 1;
2143                cursor = pos + 1;
2144                while cursor < len && data[cursor] == ch {
2145                    cursor += 1;
2146                }
2147            }
2148            None => {
2149                let remaining = len - cursor;
2150                if remaining > 0 {
2151                    if wp + remaining > buf_size {
2152                        writer.write_all(&outbuf[..wp])?;
2153                        wp = 0;
2154                    }
2155                    if remaining > buf_size {
2156                        writer.write_all(&data[cursor..])?;
2157                    } else {
2158                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
2159                        wp += remaining;
2160                    }
2161                }
2162                break;
2163            }
2164        }
2165    }
2166
2167    if wp > 0 {
2168        writer.write_all(&outbuf[..wp])?;
2169    }
2170    Ok(())
2171}
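
// Closing sketch (added for illustration): the single-byte path first looks for
// any doubled occurrence with memmem and writes the input through untouched if
// none exists; otherwise runs of the byte collapse to one, as in `tr -s ' '`.
#[cfg(test)]
mod squeeze_single_sketch {
    use super::*;

    #[test]
    fn collapses_runs_of_the_byte() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_single_mmap(b' ', b"a  b   c", &mut out).unwrap();
        assert_eq!(out, b"a b c");
    }

    #[test]
    fn passes_through_when_no_run_exists() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_single_mmap(b' ', b"a b c", &mut out).unwrap();
        assert_eq!(out, b"a b c");
    }
}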