
coreutils_rs/tr/core.rs

1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5/// Maximum IoSlice entries per write_vectored batch.
6/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
7const MAX_IOV: usize = 1024;
8
9/// Main processing buffer for the chunked mmap paths: 4MB per output chunk.
10const BUF_SIZE: usize = 4 * 1024 * 1024;
11
12/// Stream buffer: 1MB — fits in L2/L3 cache so data stays hot between read,
13/// process, and write phases. Larger buffers (4-8MB) cause cache thrashing.
14const STREAM_BUF: usize = 1024 * 1024;
15
16/// Translate stream buffer: 4MB — translate is compute-light (single table lookup
17/// per byte) so the bottleneck is I/O syscalls, not cache pressure. Larger buffer
18/// means fewer read()/write() syscall pairs (3 for 10MB instead of 10).
19const TRANSLATE_STREAM_BUF: usize = 4 * 1024 * 1024;
20
21/// Minimum data size to engage rayon parallel processing for mmap paths.
22/// Below this, single-threaded is faster due to thread pool overhead.
23const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
24
25/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
26/// Falls back to write_all per slice for partial writes.
27#[inline]
28fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
29    if slices.is_empty() {
30        return Ok(());
31    }
32    for batch in slices.chunks(MAX_IOV) {
33        let total: usize = batch.iter().map(|s| s.len()).sum();
34        match writer.write_vectored(batch) {
35            Ok(n) if n >= total => continue,
36            Ok(mut written) => {
37                // Partial write: fall back to write_all per remaining slice
38                for slice in batch {
39                    let slen = slice.len();
40                    if written >= slen {
41                        written -= slen;
42                        continue;
43                    }
44                    if written > 0 {
45                        writer.write_all(&slice[written..])?;
46                        written = 0;
47                    } else {
48                        writer.write_all(slice)?;
49                    }
50                }
51            }
52            Err(e) => return Err(e),
53        }
54    }
55    Ok(())
56}
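
// A small sketch (not part of the original test suite): batches of IoSlices end
// up fully written whether the writer's write_vectored consumes every slice or
// only a prefix, thanks to the write_all fallback above.
#[cfg(test)]
mod write_ioslices_tests {
    use super::write_ioslices;
    use std::io::IoSlice;

    #[test]
    fn writes_every_slice() {
        let mut out: Vec<u8> = Vec::new();
        let parts = [IoSlice::new(b"foo"), IoSlice::new(b""), IoSlice::new(b"bar")];
        write_ioslices(&mut out, &parts).unwrap();
        assert_eq!(out, b"foobar".to_vec());
    }
}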
57
58/// Allocate a Vec<u8> of given length without zero-initialization.
59/// SAFETY: Caller must write all bytes before reading them.
60#[inline]
61#[allow(clippy::uninit_vec)]
62fn alloc_uninit_vec(len: usize) -> Vec<u8> {
63    let mut v = Vec::with_capacity(len);
64    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
65    unsafe {
66        v.set_len(len);
67    }
68    v
69}
70
71/// Build a 256-byte lookup table mapping set1[i] -> set2[i]; excess set1 bytes map to the last byte of set2.
72#[inline]
73fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
74    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
75    let last = set2.last().copied();
76    for (i, &from) in set1.iter().enumerate() {
77        table[from as usize] = if i < set2.len() {
78            set2[i]
79        } else {
80            last.unwrap_or(from)
81        };
82    }
83    table
84}
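
// A small sketch (not part of the original test suite) of the table semantics:
// set1 bytes map positionally into set2, and excess set1 bytes fall back to the
// last byte of set2.
#[cfg(test)]
mod translate_table_tests {
    use super::build_translate_table;

    #[test]
    fn pads_short_set2_with_last_byte() {
        let table = build_translate_table(b"abc", b"xy");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'y');
        assert_eq!(table[b'c' as usize], b'y'); // padded with the last byte of set2
        assert_eq!(table[b'z' as usize], b'z'); // unmapped bytes stay identity
    }
}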
85
86/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
87#[inline]
88fn build_member_set(chars: &[u8]) -> [u8; 32] {
89    let mut set = [0u8; 32];
90    for &ch in chars {
91        set[ch as usize >> 3] |= 1 << (ch & 7);
92    }
93    set
94}
95
96#[inline(always)]
97fn is_member(set: &[u8; 32], ch: u8) -> bool {
98    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
99}
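
// Sanity sketch of the bitset layout (illustrative, not from the original test
// suite): byte index = ch >> 3, bit index = ch & 7.
#[cfg(test)]
mod member_set_tests {
    use super::{build_member_set, is_member};

    #[test]
    fn digits_are_members() {
        let set = build_member_set(b"0123456789");
        assert!(is_member(&set, b'5'));
        assert!(!is_member(&set, b'a'));
        // b'0' = 0x30 lives in set[0x30 >> 3] = set[6], bit 0x30 & 7 = 0.
        assert_eq!(set[6] & 1, 1);
    }
}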
100
101/// Translate bytes in-place using a 256-byte lookup table.
102/// Uses 8x unrolling for better instruction-level parallelism.
103#[inline(always)]
104fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
105    let len = data.len();
106    let ptr = data.as_mut_ptr();
107    let mut i = 0;
108    unsafe {
109        while i + 8 <= len {
110            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
111            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
112            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
113            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
114            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
115            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
116            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
117            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
118            i += 8;
119        }
120        while i < len {
121            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
122            i += 1;
123        }
124    }
125}
126
127/// Translate bytes from source to destination using a 256-byte lookup table.
128#[inline(always)]
129fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
130    debug_assert!(dst.len() >= src.len());
131    unsafe {
132        let sp = src.as_ptr();
133        let dp = dst.as_mut_ptr();
134        let len = src.len();
135        let mut i = 0;
136        while i + 8 <= len {
137            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
138            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
139            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
140            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
141            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
142            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
143            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
144            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
145            i += 8;
146        }
147        while i < len {
148            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
149            i += 1;
150        }
151    }
152}
153
154// ============================================================================
155// SIMD range translation (x86_64)
156// ============================================================================
157
158/// Detect if the translate table is a single contiguous range with constant offset.
159/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
160/// table[i] = i + offset for all i in [lo, hi].
161#[inline]
162fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
163    let mut lo: Option<u8> = None;
164    let mut hi = 0u8;
165    let mut offset = 0i16;
166
167    for i in 0..256 {
168        if table[i] != i as u8 {
169            let diff = table[i] as i16 - i as i16;
170            match lo {
171                None => {
172                    lo = Some(i as u8);
173                    hi = i as u8;
174                    offset = diff;
175                }
176                Some(_) => {
177                    if diff != offset || i as u8 != hi.wrapping_add(1) {
178                        return None;
179                    }
180                    hi = i as u8;
181                }
182            }
183        }
184    }
185
186    lo.map(|l| (l, hi, offset as i8))
187}
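
// A small sketch (not part of the original test suite): a full a-z -> A-Z table
// is detected as one contiguous range with a constant offset of -32, while a
// non-contiguous mapping is rejected.
#[cfg(test)]
mod range_offset_tests {
    use super::{build_translate_table, detect_range_offset};

    #[test]
    fn lowercase_to_uppercase_is_one_range() {
        let table = build_translate_table(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        );
        assert_eq!(detect_range_offset(&table), Some((b'a', b'z', -32)));

        let swapped = build_translate_table(b"az", b"za");
        assert_eq!(detect_range_offset(&swapped), None);
    }
}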
188
189/// SIMD-accelerated range translation for mmap'd data.
190/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
191/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
192#[cfg(target_arch = "x86_64")]
193fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
194    if is_x86_feature_detected!("avx2") {
195        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
196    } else {
197        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
198    }
199}
200
201#[cfg(target_arch = "x86_64")]
202#[target_feature(enable = "avx2")]
203unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
204    use std::arch::x86_64::*;
205
206    unsafe {
207        let range = hi - lo;
208        // Bias: shift range so lo maps to -128 (signed min).
209        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
210        // For input outside [lo, hi]: in the signed-byte domain, biased always lands
211        // above -128+range (the threshold), so the byte is left untranslated.
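        // Worked example (illustrative): lo = b'a' (0x61), hi = b'z' (0x7A), range = 25.
        //   bias = 0x80 - 0x61 = 0x1F; threshold = 0x80 + 25 = 0x99 (-103 as signed).
        //   'q' (0x71): biased = 0x90 = -112 <= -103  -> in range, offset applied.
        //   '0' (0x30): biased = 0x4F = +79  >  -103  -> out of range, copied as-is.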
212        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
213        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
214        let offset_v = _mm256_set1_epi8(offset);
215        let zero = _mm256_setzero_si256();
216
217        let len = src.len();
218        let mut i = 0;
219
220        while i + 32 <= len {
221            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
222            let biased = _mm256_add_epi8(input, bias_v);
223            // gt = 0xFF where biased > threshold (OUT of range)
224            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
225            // mask = 0xFF where IN range (NOT gt)
226            let mask = _mm256_cmpeq_epi8(gt, zero);
227            let offset_masked = _mm256_and_si256(mask, offset_v);
228            let result = _mm256_add_epi8(input, offset_masked);
229            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
230            i += 32;
231        }
232
233        // SSE2 tail for 16-byte remainder
234        if i + 16 <= len {
235            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
236            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
237            let offset_v128 = _mm_set1_epi8(offset);
238            let zero128 = _mm_setzero_si128();
239
240            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
241            let biased = _mm_add_epi8(input, bias_v128);
242            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
243            let mask = _mm_cmpeq_epi8(gt, zero128);
244            let offset_masked = _mm_and_si128(mask, offset_v128);
245            let result = _mm_add_epi8(input, offset_masked);
246            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
247            i += 16;
248        }
249
250        // Scalar tail
251        while i < len {
252            let b = *src.get_unchecked(i);
253            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
254                b.wrapping_add(offset as u8)
255            } else {
256                b
257            };
258            i += 1;
259        }
260    }
261}
262
263#[cfg(target_arch = "x86_64")]
264#[target_feature(enable = "sse2")]
265unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
266    use std::arch::x86_64::*;
267
268    unsafe {
269        let range = hi - lo;
270        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
271        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
272        let offset_v = _mm_set1_epi8(offset);
273        let zero = _mm_setzero_si128();
274
275        let len = src.len();
276        let mut i = 0;
277
278        while i + 16 <= len {
279            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
280            let biased = _mm_add_epi8(input, bias_v);
281            let gt = _mm_cmpgt_epi8(biased, threshold_v);
282            let mask = _mm_cmpeq_epi8(gt, zero);
283            let offset_masked = _mm_and_si128(mask, offset_v);
284            let result = _mm_add_epi8(input, offset_masked);
285            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
286            i += 16;
287        }
288
289        while i < len {
290            let b = *src.get_unchecked(i);
291            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
292                b.wrapping_add(offset as u8)
293            } else {
294                b
295            };
296            i += 1;
297        }
298    }
299}
300
301/// Scalar range translation fallback for non-x86_64.
302#[cfg(not(target_arch = "x86_64"))]
303fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
304    for (i, &b) in src.iter().enumerate() {
305        dst[i] = if b >= lo && b <= hi {
306            b.wrapping_add(offset as u8)
307        } else {
308            b
309        };
310    }
311}
312
313// ============================================================================
314// In-place SIMD range translation (saves one buffer allocation in streaming)
315// ============================================================================
316
317/// In-place SIMD-accelerated range translation.
318/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
319/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
320#[cfg(target_arch = "x86_64")]
321fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
322    if is_x86_feature_detected!("avx2") {
323        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
324    } else {
325        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
326    }
327}
328
329#[cfg(target_arch = "x86_64")]
330#[target_feature(enable = "avx2")]
331unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
332    use std::arch::x86_64::*;
333
334    unsafe {
335        let range = hi - lo;
336        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
337        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
338        let offset_v = _mm256_set1_epi8(offset);
339        let zero = _mm256_setzero_si256();
340
341        let len = data.len();
342        let ptr = data.as_mut_ptr();
343        let mut i = 0;
344
345        while i + 32 <= len {
346            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
347            let biased = _mm256_add_epi8(input, bias_v);
348            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
349            let mask = _mm256_cmpeq_epi8(gt, zero);
350            let offset_masked = _mm256_and_si256(mask, offset_v);
351            let result = _mm256_add_epi8(input, offset_masked);
352            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
353            i += 32;
354        }
355
356        if i + 16 <= len {
357            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
358            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
359            let offset_v128 = _mm_set1_epi8(offset);
360            let zero128 = _mm_setzero_si128();
361
362            let input = _mm_loadu_si128(ptr.add(i) as *const _);
363            let biased = _mm_add_epi8(input, bias_v128);
364            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
365            let mask = _mm_cmpeq_epi8(gt, zero128);
366            let offset_masked = _mm_and_si128(mask, offset_v128);
367            let result = _mm_add_epi8(input, offset_masked);
368            _mm_storeu_si128(ptr.add(i) as *mut _, result);
369            i += 16;
370        }
371
372        while i < len {
373            let b = *ptr.add(i);
374            *ptr.add(i) = if b >= lo && b <= hi {
375                b.wrapping_add(offset as u8)
376            } else {
377                b
378            };
379            i += 1;
380        }
381    }
382}
383
384#[cfg(target_arch = "x86_64")]
385#[target_feature(enable = "sse2")]
386unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
387    use std::arch::x86_64::*;
388
389    unsafe {
390        let range = hi - lo;
391        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
392        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
393        let offset_v = _mm_set1_epi8(offset);
394        let zero = _mm_setzero_si128();
395
396        let len = data.len();
397        let ptr = data.as_mut_ptr();
398        let mut i = 0;
399
400        while i + 16 <= len {
401            let input = _mm_loadu_si128(ptr.add(i) as *const _);
402            let biased = _mm_add_epi8(input, bias_v);
403            let gt = _mm_cmpgt_epi8(biased, threshold_v);
404            let mask = _mm_cmpeq_epi8(gt, zero);
405            let offset_masked = _mm_and_si128(mask, offset_v);
406            let result = _mm_add_epi8(input, offset_masked);
407            _mm_storeu_si128(ptr.add(i) as *mut _, result);
408            i += 16;
409        }
410
411        while i < len {
412            let b = *ptr.add(i);
413            *ptr.add(i) = if b >= lo && b <= hi {
414                b.wrapping_add(offset as u8)
415            } else {
416                b
417            };
418            i += 1;
419        }
420    }
421}
422
423#[cfg(not(target_arch = "x86_64"))]
424fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
425    for b in data.iter_mut() {
426        if *b >= lo && *b <= hi {
427            *b = b.wrapping_add(offset as u8);
428        }
429    }
430}
431
432// ============================================================================
433// SIMD range deletion (x86_64)
434// ============================================================================
435
436/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
437/// Returns Some((lo, hi)) if so. This is true for common classes:
438/// - `[:digit:]` = 0x30..=0x39
439/// - `a-z` = 0x61..=0x7A
440/// - `A-Z` = 0x41..=0x5A
441#[inline]
442fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
443    if chars.is_empty() {
444        return None;
445    }
446    let mut lo = chars[0];
447    let mut hi = chars[0];
448    for &c in &chars[1..] {
449        if c < lo {
450            lo = c;
451        }
452        if c > hi {
453            hi = c;
454        }
455    }
456    // Verify every byte in [lo..=hi] is actually present; a bare length check
457    // misfires on duplicate inputs (e.g. "aac" spans a-c but omits 'b').
458    let seen = build_member_set(chars);
459    if (lo..=hi).all(|b| is_member(&seen, b)) {
460        Some((lo, hi))
461    } else {
462        None
463    }
464}
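
// A small sketch (not part of the original test suite): [:digit:] collapses to
// the contiguous range 0x30..=0x39, while a gapped set falls back to the
// bitset path.
#[cfg(test)]
mod delete_range_detect_tests {
    use super::detect_delete_range;

    #[test]
    fn digits_form_a_range_but_gapped_sets_do_not() {
        assert_eq!(detect_delete_range(b"0123456789"), Some((b'0', b'9')));
        assert_eq!(detect_delete_range(b"aeiou"), None); // gaps between vowels
        assert_eq!(detect_delete_range(b""), None);
    }
}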
463
464/// SIMD-accelerated delete for contiguous byte ranges.
465/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
466/// then compacts output by skipping matched bytes.
467#[cfg(target_arch = "x86_64")]
468fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
469    if is_x86_feature_detected!("avx2") {
470        unsafe { delete_range_avx2(src, dst, lo, hi) }
471    } else {
472        unsafe { delete_range_sse2(src, dst, lo, hi) }
473    }
474}
475
476#[cfg(target_arch = "x86_64")]
477#[target_feature(enable = "avx2")]
478unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
479    use std::arch::x86_64::*;
480
481    unsafe {
482        let range = hi - lo;
483        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
484        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
485        let zero = _mm256_setzero_si256();
486
487        let len = src.len();
488        let sp = src.as_ptr();
489        let dp = dst.as_mut_ptr();
490        let mut ri = 0;
491        let mut wp = 0;
492
493        while ri + 32 <= len {
494            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
495            let biased = _mm256_add_epi8(input, bias_v);
496            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
497            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
498            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
499            let in_range = _mm256_cmpeq_epi8(gt, zero);
500            // keep_mask bits: 1 = keep (NOT in range)
501            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
502
503            // Compact: copy kept bytes using the mask
504            let mut mask = keep_mask;
505            while mask != 0 {
506                let bit = mask.trailing_zeros() as usize;
507                *dp.add(wp) = *sp.add(ri + bit);
508                wp += 1;
509                mask &= mask - 1; // clear lowest set bit
510            }
511            ri += 32;
512        }
513
514        // SSE2 tail for 16-byte remainder
515        if ri + 16 <= len {
516            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
517            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
518            let zero128 = _mm_setzero_si128();
519
520            let input = _mm_loadu_si128(sp.add(ri) as *const _);
521            let biased = _mm_add_epi8(input, bias_v128);
522            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
523            let in_range = _mm_cmpeq_epi8(gt, zero128);
524            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
525
526            let mut mask = keep_mask;
527            while mask != 0 {
528                let bit = mask.trailing_zeros() as usize;
529                *dp.add(wp) = *sp.add(ri + bit);
530                wp += 1;
531                mask &= mask - 1;
532            }
533            ri += 16;
534        }
535
536        // Scalar tail
537        while ri < len {
538            let b = *sp.add(ri);
539            if b < lo || b > hi {
540                *dp.add(wp) = b;
541                wp += 1;
542            }
543            ri += 1;
544        }
545
546        wp
547    }
548}
549
550#[cfg(target_arch = "x86_64")]
551#[target_feature(enable = "sse2")]
552unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
553    use std::arch::x86_64::*;
554
555    unsafe {
556        let range = hi - lo;
557        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
558        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
559        let zero = _mm_setzero_si128();
560
561        let len = src.len();
562        let sp = src.as_ptr();
563        let dp = dst.as_mut_ptr();
564        let mut ri = 0;
565        let mut wp = 0;
566
567        while ri + 16 <= len {
568            let input = _mm_loadu_si128(sp.add(ri) as *const _);
569            let biased = _mm_add_epi8(input, bias_v);
570            let gt = _mm_cmpgt_epi8(biased, threshold_v);
571            let in_range = _mm_cmpeq_epi8(gt, zero);
572            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
573
574            let mut mask = keep_mask;
575            while mask != 0 {
576                let bit = mask.trailing_zeros() as usize;
577                *dp.add(wp) = *sp.add(ri + bit);
578                wp += 1;
579                mask &= mask - 1;
580            }
581            ri += 16;
582        }
583
584        while ri < len {
585            let b = *sp.add(ri);
586            if b < lo || b > hi {
587                *dp.add(wp) = b;
588                wp += 1;
589            }
590            ri += 1;
591        }
592
593        wp
594    }
595}
596
597/// Scalar range delete fallback for non-x86_64.
598#[cfg(not(target_arch = "x86_64"))]
599fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
600    let mut wp = 0;
601    for &b in src {
602        if b < lo || b > hi {
603            dst[wp] = b;
604            wp += 1;
605        }
606    }
607    wp
608}
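
// A small sketch (not part of the original test suite) of the range-delete
// kernel: bytes inside [lo..=hi] are dropped, everything else is compacted to
// the front of `dst`, and the count of kept bytes is returned.
#[cfg(test)]
mod delete_range_chunk_tests {
    use super::delete_range_chunk;

    #[test]
    fn drops_digits() {
        let src = b"a1b22c333";
        let mut dst = vec![0u8; src.len()];
        let n = delete_range_chunk(src, &mut dst, b'0', b'9');
        assert_eq!(&dst[..n], b"abc");
    }
}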
609
610/// Streaming delete for a contiguous byte range [lo..=hi] using the SIMD range-delete kernel.
611fn delete_range_streaming(
612    lo: u8,
613    hi: u8,
614    reader: &mut impl Read,
615    writer: &mut impl Write,
616) -> io::Result<()> {
617    let mut src = vec![0u8; STREAM_BUF];
618    let mut dst = alloc_uninit_vec(STREAM_BUF);
619    loop {
620        let n = read_full(reader, &mut src)?;
621        if n == 0 {
622            break;
623        }
624        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
625        if wp > 0 {
626            writer.write_all(&dst[..wp])?;
627        }
628    }
629    Ok(())
630}
631
632// ============================================================================
633// Streaming functions (Read + Write)
634// ============================================================================
635
636pub fn translate(
637    set1: &[u8],
638    set2: &[u8],
639    reader: &mut impl Read,
640    writer: &mut impl Write,
641) -> io::Result<()> {
642    let table = build_translate_table(set1, set2);
643
644    // Check for identity table — pure passthrough (no transformation needed)
645    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
646    if is_identity {
647        return passthrough_stream(reader, writer);
648    }
649
650    // Try SIMD fast path for range translations (in-place, single buffer)
651    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
652        return translate_range_stream(lo, hi, offset, reader, writer);
653    }
654
655    // General case: IN-PLACE translation on a SINGLE 4MB buffer.
656    // This halves the buffer working set vs the old separate src/dst approach:
657    // - Old: src (4MB) + dst (4MB) = 8MB of buffers touched per 4MB of input
658    // - New: one 4MB buffer read and written in place, so the data stays cache-hot.
659    // The 8x-unrolled in-place translate avoids store-to-load forwarding stalls
660    // because consecutive reads are 8 bytes apart (sequential), not aliased.
661    // Using 4MB buffer (vs 1MB) reduces syscall count from 10 to 3 for 10MB.
662    let mut buf = vec![0u8; TRANSLATE_STREAM_BUF];
663    loop {
664        let n = read_full(reader, &mut buf)?;
665        if n == 0 {
666            break;
667        }
668        translate_inplace(&mut buf[..n], &table);
669        writer.write_all(&buf[..n])?;
670    }
671    Ok(())
672}
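
// A small sketch (not part of the original test suite): a byte slice works as
// the reader and a Vec as the writer, so the streaming fast paths can be
// exercised without touching stdin/stdout.
#[cfg(test)]
mod translate_stream_tests {
    use super::translate;

    #[test]
    fn translates_through_the_range_fast_path() {
        let mut input: &[u8] = b"a cab";
        let mut out: Vec<u8> = Vec::new();
        translate(b"abc", b"xyz", &mut input, &mut out).unwrap();
        assert_eq!(out, b"x zxy".to_vec());
    }
}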
673
674/// Streaming SIMD range translation — single buffer, in-place transform.
675/// Uses 4MB buffer for fewer syscalls (translate is compute-light).
676fn translate_range_stream(
677    lo: u8,
678    hi: u8,
679    offset: i8,
680    reader: &mut impl Read,
681    writer: &mut impl Write,
682) -> io::Result<()> {
683    let mut buf = vec![0u8; TRANSLATE_STREAM_BUF];
684    loop {
685        let n = read_full(reader, &mut buf)?;
686        if n == 0 {
687            break;
688        }
689        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
690        writer.write_all(&buf[..n])?;
691    }
692    Ok(())
693}
694
695/// Pure passthrough: copy the input to the output without transformation.
696/// Uses a single 4MB buffer with direct read/write, no processing overhead.
697fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
698    let mut buf = vec![0u8; TRANSLATE_STREAM_BUF];
699    loop {
700        let n = read_full(reader, &mut buf)?;
701        if n == 0 {
702            break;
703        }
704        writer.write_all(&buf[..n])?;
705    }
706    Ok(())
707}
708
709/// Read as many bytes as possible into buf, retrying on partial reads.
710#[inline]
711fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
712    let mut total = 0;
713    while total < buf.len() {
714        match reader.read(&mut buf[total..]) {
715            Ok(0) => break,
716            Ok(n) => total += n,
717            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
718            Err(e) => return Err(e),
719        }
720    }
721    Ok(total)
722}
723
724pub fn translate_squeeze(
725    set1: &[u8],
726    set2: &[u8],
727    reader: &mut impl Read,
728    writer: &mut impl Write,
729) -> io::Result<()> {
730    let table = build_translate_table(set1, set2);
731    let squeeze_set = build_member_set(set2);
732
733    // Two-pass optimization for range translations:
734    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
735    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
736    // Even though it's two passes, the translate pass is so much faster with SIMD
737    // that the total is still a net win.
738    let range_info = detect_range_offset(&table);
739
740    let mut buf = vec![0u8; STREAM_BUF];
741    let mut last_squeezed: u16 = 256;
742
743    loop {
744        let n = read_full(reader, &mut buf)?;
745        if n == 0 {
746            break;
747        }
748        // Pass 1: translate
749        if let Some((lo, hi, offset)) = range_info {
750            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
751        } else {
752            translate_inplace(&mut buf[..n], &table);
753        }
754        // Pass 2: squeeze in-place
755        let mut wp = 0;
756        unsafe {
757            let ptr = buf.as_mut_ptr();
758            for i in 0..n {
759                let b = *ptr.add(i);
760                if is_member(&squeeze_set, b) {
761                    if last_squeezed == b as u16 {
762                        continue;
763                    }
764                    last_squeezed = b as u16;
765                } else {
766                    last_squeezed = 256;
767                }
768                *ptr.add(wp) = b;
769                wp += 1;
770            }
771        }
772        writer.write_all(&buf[..wp])?;
773    }
774    Ok(())
775}
776
777pub fn delete(
778    delete_chars: &[u8],
779    reader: &mut impl Read,
780    writer: &mut impl Write,
781) -> io::Result<()> {
782    if delete_chars.len() == 1 {
783        return delete_single_streaming(delete_chars[0], reader, writer);
784    }
785    if delete_chars.len() <= 3 {
786        return delete_multi_streaming(delete_chars, reader, writer);
787    }
788
789    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
790    // use vectorized range comparison instead of scalar bitset lookup.
791    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
792    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
793        return delete_range_streaming(lo, hi, reader, writer);
794    }
795
796    let member = build_member_set(delete_chars);
797    let mut buf = vec![0u8; STREAM_BUF];
798
799    loop {
800        let n = read_full(reader, &mut buf)?;
801        if n == 0 {
802            break;
803        }
804        let mut wp = 0;
805        unsafe {
806            let ptr = buf.as_mut_ptr();
807            let mut i = 0;
808            while i + 8 <= n {
809                let b0 = *ptr.add(i);
810                let b1 = *ptr.add(i + 1);
811                let b2 = *ptr.add(i + 2);
812                let b3 = *ptr.add(i + 3);
813                let b4 = *ptr.add(i + 4);
814                let b5 = *ptr.add(i + 5);
815                let b6 = *ptr.add(i + 6);
816                let b7 = *ptr.add(i + 7);
817
818                if !is_member(&member, b0) {
819                    *ptr.add(wp) = b0;
820                    wp += 1;
821                }
822                if !is_member(&member, b1) {
823                    *ptr.add(wp) = b1;
824                    wp += 1;
825                }
826                if !is_member(&member, b2) {
827                    *ptr.add(wp) = b2;
828                    wp += 1;
829                }
830                if !is_member(&member, b3) {
831                    *ptr.add(wp) = b3;
832                    wp += 1;
833                }
834                if !is_member(&member, b4) {
835                    *ptr.add(wp) = b4;
836                    wp += 1;
837                }
838                if !is_member(&member, b5) {
839                    *ptr.add(wp) = b5;
840                    wp += 1;
841                }
842                if !is_member(&member, b6) {
843                    *ptr.add(wp) = b6;
844                    wp += 1;
845                }
846                if !is_member(&member, b7) {
847                    *ptr.add(wp) = b7;
848                    wp += 1;
849                }
850                i += 8;
851            }
852            while i < n {
853                let b = *ptr.add(i);
854                if !is_member(&member, b) {
855                    *ptr.add(wp) = b;
856                    wp += 1;
857                }
858                i += 1;
859            }
860        }
861        writer.write_all(&buf[..wp])?;
862    }
863    Ok(())
864}
865
866fn delete_single_streaming(
867    ch: u8,
868    reader: &mut impl Read,
869    writer: &mut impl Write,
870) -> io::Result<()> {
871    let mut buf = vec![0u8; STREAM_BUF];
872    loop {
873        let n = match reader.read(&mut buf) {
874            Ok(0) => break,
875            Ok(n) => n,
876            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
877            Err(e) => return Err(e),
878        };
879        let mut wp = 0;
880        let mut i = 0;
881        while i < n {
882            match memchr::memchr(ch, &buf[i..n]) {
883                Some(offset) => {
884                    if offset > 0 {
885                        if wp != i {
886                            unsafe {
887                                std::ptr::copy(
888                                    buf.as_ptr().add(i),
889                                    buf.as_mut_ptr().add(wp),
890                                    offset,
891                                );
892                            }
893                        }
894                        wp += offset;
895                    }
896                    i += offset + 1;
897                }
898                None => {
899                    let run_len = n - i;
900                    if run_len > 0 {
901                        if wp != i {
902                            unsafe {
903                                std::ptr::copy(
904                                    buf.as_ptr().add(i),
905                                    buf.as_mut_ptr().add(wp),
906                                    run_len,
907                                );
908                            }
909                        }
910                        wp += run_len;
911                    }
912                    break;
913                }
914            }
915        }
916        writer.write_all(&buf[..wp])?;
917    }
918    Ok(())
919}
920
921fn delete_multi_streaming(
922    chars: &[u8],
923    reader: &mut impl Read,
924    writer: &mut impl Write,
925) -> io::Result<()> {
926    let mut buf = vec![0u8; STREAM_BUF];
927    loop {
928        let n = match reader.read(&mut buf) {
929            Ok(0) => break,
930            Ok(n) => n,
931            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
932            Err(e) => return Err(e),
933        };
934        let mut wp = 0;
935        let mut i = 0;
936        while i < n {
937            let found = if chars.len() == 2 {
938                memchr::memchr2(chars[0], chars[1], &buf[i..n])
939            } else {
940                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
941            };
942            match found {
943                Some(offset) => {
944                    if offset > 0 {
945                        if wp != i {
946                            unsafe {
947                                std::ptr::copy(
948                                    buf.as_ptr().add(i),
949                                    buf.as_mut_ptr().add(wp),
950                                    offset,
951                                );
952                            }
953                        }
954                        wp += offset;
955                    }
956                    i += offset + 1;
957                }
958                None => {
959                    let run_len = n - i;
960                    if run_len > 0 {
961                        if wp != i {
962                            unsafe {
963                                std::ptr::copy(
964                                    buf.as_ptr().add(i),
965                                    buf.as_mut_ptr().add(wp),
966                                    run_len,
967                                );
968                            }
969                        }
970                        wp += run_len;
971                    }
972                    break;
973                }
974            }
975        }
976        writer.write_all(&buf[..wp])?;
977    }
978    Ok(())
979}
980
981pub fn delete_squeeze(
982    delete_chars: &[u8],
983    squeeze_chars: &[u8],
984    reader: &mut impl Read,
985    writer: &mut impl Write,
986) -> io::Result<()> {
987    let delete_set = build_member_set(delete_chars);
988    let squeeze_set = build_member_set(squeeze_chars);
989    let mut buf = vec![0u8; STREAM_BUF];
990    let mut last_squeezed: u16 = 256;
991
992    loop {
993        let n = read_full(reader, &mut buf)?;
994        if n == 0 {
995            break;
996        }
997        let mut wp = 0;
998        unsafe {
999            let ptr = buf.as_mut_ptr();
1000            for i in 0..n {
1001                let b = *ptr.add(i);
1002                if is_member(&delete_set, b) {
1003                    continue;
1004                }
1005                if is_member(&squeeze_set, b) {
1006                    if last_squeezed == b as u16 {
1007                        continue;
1008                    }
1009                    last_squeezed = b as u16;
1010                } else {
1011                    last_squeezed = 256;
1012                }
1013                *ptr.add(wp) = b;
1014                wp += 1;
1015            }
1016        }
1017        writer.write_all(&buf[..wp])?;
1018    }
1019    Ok(())
1020}
1021
1022pub fn squeeze(
1023    squeeze_chars: &[u8],
1024    reader: &mut impl Read,
1025    writer: &mut impl Write,
1026) -> io::Result<()> {
1027    if squeeze_chars.len() == 1 {
1028        return squeeze_single_stream(squeeze_chars[0], reader, writer);
1029    }
1030
1031    let member = build_member_set(squeeze_chars);
1032    let mut buf = vec![0u8; STREAM_BUF];
1033    let mut last_squeezed: u16 = 256;
1034
1035    loop {
1036        let n = read_full(reader, &mut buf)?;
1037        if n == 0 {
1038            break;
1039        }
1040        let mut wp = 0;
1041        unsafe {
1042            let ptr = buf.as_mut_ptr();
1043            for i in 0..n {
1044                let b = *ptr.add(i);
1045                if is_member(&member, b) {
1046                    if last_squeezed == b as u16 {
1047                        continue;
1048                    }
1049                    last_squeezed = b as u16;
1050                } else {
1051                    last_squeezed = 256;
1052                }
1053                *ptr.add(wp) = b;
1054                wp += 1;
1055            }
1056        }
1057        writer.write_all(&buf[..wp])?;
1058    }
1059    Ok(())
1060}
1061
1062fn squeeze_single_stream(
1063    ch: u8,
1064    reader: &mut impl Read,
1065    writer: &mut impl Write,
1066) -> io::Result<()> {
1067    let mut buf = vec![0u8; STREAM_BUF];
1068    let mut was_squeeze_char = false;
1069
1070    loop {
1071        let n = match reader.read(&mut buf) {
1072            Ok(0) => break,
1073            Ok(n) => n,
1074            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1075            Err(e) => return Err(e),
1076        };
1077
1078        let mut wp = 0;
1079        let mut i = 0;
1080
1081        while i < n {
1082            if was_squeeze_char && buf[i] == ch {
1083                i += 1;
1084                while i < n && buf[i] == ch {
1085                    i += 1;
1086                }
1087                if i >= n {
1088                    break;
1089                }
1090            }
1091
1092            match memchr::memchr(ch, &buf[i..n]) {
1093                Some(offset) => {
1094                    let run_len = offset;
1095                    if run_len > 0 {
1096                        if wp != i {
1097                            unsafe {
1098                                std::ptr::copy(
1099                                    buf.as_ptr().add(i),
1100                                    buf.as_mut_ptr().add(wp),
1101                                    run_len,
1102                                );
1103                            }
1104                        }
1105                        wp += run_len;
1106                    }
1107                    i += run_len;
1108
1109                    unsafe {
1110                        *buf.as_mut_ptr().add(wp) = ch;
1111                    }
1112                    wp += 1;
1113                    was_squeeze_char = true;
1114                    i += 1;
1115                    while i < n && buf[i] == ch {
1116                        i += 1;
1117                    }
1118                }
1119                None => {
1120                    let run_len = n - i;
1121                    if run_len > 0 {
1122                        if wp != i {
1123                            unsafe {
1124                                std::ptr::copy(
1125                                    buf.as_ptr().add(i),
1126                                    buf.as_mut_ptr().add(wp),
1127                                    run_len,
1128                                );
1129                            }
1130                        }
1131                        wp += run_len;
1132                    }
1133                    was_squeeze_char = false;
1134                    break;
1135                }
1136            }
1137        }
1138
1139        writer.write_all(&buf[..wp])?;
1140    }
1141    Ok(())
1142}
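
// A small sketch (not part of the original test suite) of the squeeze
// semantics: runs of a squeeze byte collapse to a single occurrence, with the
// run state carried across reads.
#[cfg(test)]
mod squeeze_tests {
    use super::squeeze;

    #[test]
    fn collapses_runs_of_spaces() {
        let mut input: &[u8] = b"a   b  c";
        let mut out: Vec<u8> = Vec::new();
        squeeze(b" ", &mut input, &mut out).unwrap();
        assert_eq!(out, b"a b c".to_vec());
    }
}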
1143
1144// ============================================================================
1145// Mmap-based functions (zero-copy input from byte slice)
1146// ============================================================================
1147
1148/// Maximum data size for single-allocation translate approach.
1149/// Below this limit, translate ALL data into one buffer and do a single write_all.
1150/// Above this, use chunked approach to limit memory usage.
1151const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;
1152
1153/// Translate bytes from an mmap'd byte slice.
1154/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
1155/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
1156/// Falls back to scalar 256-byte table lookup for general translations.
1157///
1158/// For data >= 2MB: uses rayon parallel processing across multiple cores.
1159/// For data <= 16MB: single allocation + single write_all (1 syscall).
1160/// For data > 16MB: chunked approach to limit memory (N syscalls where N = data/4MB).
1161pub fn translate_mmap(
1162    set1: &[u8],
1163    set2: &[u8],
1164    data: &[u8],
1165    writer: &mut impl Write,
1166) -> io::Result<()> {
1167    let table = build_translate_table(set1, set2);
1168
1169    // Check if table is identity — pure passthrough
1170    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1171    if is_identity {
1172        return writer.write_all(data);
1173    }
1174
1175    // Try SIMD fast path for single-range constant-offset translations
1176    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1177        return translate_mmap_range(data, writer, lo, hi, offset);
1178    }
1179
1180    // General case: table lookup (with parallel processing for large data)
1181    translate_mmap_table(data, writer, &table)
1182}
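
// A minimal usage sketch for the mmap entry point, assuming the memmap2 crate
// and a hypothetical helper name (`uppercase_file`); the real CLI wiring lives
// outside this module:
//
//     use std::fs::File;
//     use std::io;
//
//     fn uppercase_file(path: &str) -> io::Result<()> {
//         let file = File::open(path)?;
//         // SAFETY: the file must not be truncated or modified while mapped.
//         let map = unsafe { memmap2::Mmap::map(&file)? };
//         let stdout = io::stdout();
//         let mut out = stdout.lock();
//         translate_mmap(
//             b"abcdefghijklmnopqrstuvwxyz",
//             b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
//             &map,
//             &mut out,
//         )
//     }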
1183
1184/// SIMD range translate for mmap data, with rayon parallel processing.
1185fn translate_mmap_range(
1186    data: &[u8],
1187    writer: &mut impl Write,
1188    lo: u8,
1189    hi: u8,
1190    offset: i8,
1191) -> io::Result<()> {
1192    // Parallel path: split data into chunks, translate each in parallel
1193    if data.len() >= PARALLEL_THRESHOLD {
1194        let mut buf = alloc_uninit_vec(data.len());
1195        let n_threads = rayon::current_num_threads().max(1);
1196        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1197
1198        // Process chunks in parallel: each thread writes to its slice of buf
1199        data.par_chunks(chunk_size)
1200            .zip(buf.par_chunks_mut(chunk_size))
1201            .for_each(|(src_chunk, dst_chunk)| {
1202                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
1203            });
1204
1205        return writer.write_all(&buf);
1206    }
1207
1208    // Small data: single-threaded SIMD
1209    if data.len() <= SINGLE_WRITE_LIMIT {
1210        let mut buf = alloc_uninit_vec(data.len());
1211        translate_range_simd(data, &mut buf, lo, hi, offset);
1212        return writer.write_all(&buf);
1213    }
1214    // Chunked path for large data (shouldn't happen since PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT)
1215    let mut buf = alloc_uninit_vec(BUF_SIZE);
1216    for chunk in data.chunks(BUF_SIZE) {
1217        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
1218        writer.write_all(&buf[..chunk.len()])?;
1219    }
1220    Ok(())
1221}
1222
1223/// General table-lookup translate for mmap data, with rayon parallel processing.
1224fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
1225    // Parallel path: split data into chunks, translate each in parallel
1226    if data.len() >= PARALLEL_THRESHOLD {
1227        let mut buf = alloc_uninit_vec(data.len());
1228        let n_threads = rayon::current_num_threads().max(1);
1229        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1230
1231        data.par_chunks(chunk_size)
1232            .zip(buf.par_chunks_mut(chunk_size))
1233            .for_each(|(src_chunk, dst_chunk)| {
1234                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
1235            });
1236
1237        return writer.write_all(&buf);
1238    }
1239
1240    // Small data: single-threaded
1241    if data.len() <= SINGLE_WRITE_LIMIT {
1242        let mut buf = alloc_uninit_vec(data.len());
1243        translate_to(data, &mut buf, table);
1244        return writer.write_all(&buf);
1245    }
1246    let mut buf = alloc_uninit_vec(BUF_SIZE);
1247    for chunk in data.chunks(BUF_SIZE) {
1248        translate_to(chunk, &mut buf[..chunk.len()], table);
1249        writer.write_all(&buf[..chunk.len()])?;
1250    }
1251    Ok(())
1252}
1253
1254/// Translate + squeeze from mmap'd byte slice.
1255///
1256/// For data >= 2MB: two-phase approach: parallel translate, then sequential squeeze.
1257/// For data <= 16MB: single-pass translate+squeeze into one buffer, one write syscall.
1258/// For data > 16MB: chunked approach to limit memory.
1259pub fn translate_squeeze_mmap(
1260    set1: &[u8],
1261    set2: &[u8],
1262    data: &[u8],
1263    writer: &mut impl Write,
1264) -> io::Result<()> {
1265    let table = build_translate_table(set1, set2);
1266    let squeeze_set = build_member_set(set2);
1267
1268    // For large data: two-phase approach
1269    // Phase 1: parallel translate into buffer
1270    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
1271    //          (squeeze only removes bytes, never grows, so no second allocation needed)
1272    if data.len() >= PARALLEL_THRESHOLD {
1273        // Phase 1: parallel translate
1274        let mut translated = alloc_uninit_vec(data.len());
1275        let range_info = detect_range_offset(&table);
1276        let n_threads = rayon::current_num_threads().max(1);
1277        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1278
1279        if let Some((lo, hi, offset)) = range_info {
1280            data.par_chunks(chunk_size)
1281                .zip(translated.par_chunks_mut(chunk_size))
1282                .for_each(|(src_chunk, dst_chunk)| {
1283                    translate_range_simd(
1284                        src_chunk,
1285                        &mut dst_chunk[..src_chunk.len()],
1286                        lo,
1287                        hi,
1288                        offset,
1289                    );
1290                });
1291        } else {
1292            data.par_chunks(chunk_size)
1293                .zip(translated.par_chunks_mut(chunk_size))
1294                .for_each(|(src_chunk, dst_chunk)| {
1295                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
1296                });
1297        }
1298
1299        // Phase 2: squeeze in-place on the translated buffer.
1300        // Since squeeze only removes bytes (never grows), we can read ahead and
1301        // compact into the same buffer, saving a full data.len() heap allocation.
1302        let mut last_squeezed: u16 = 256;
1303        let len = translated.len();
1304        let mut wp = 0;
1305        unsafe {
1306            let ptr = translated.as_mut_ptr();
1307            let mut i = 0;
1308            while i < len {
1309                let b = *ptr.add(i);
1310                if is_member(&squeeze_set, b) {
1311                    if last_squeezed == b as u16 {
1312                        i += 1;
1313                        continue;
1314                    }
1315                    last_squeezed = b as u16;
1316                } else {
1317                    last_squeezed = 256;
1318                }
1319                *ptr.add(wp) = b;
1320                wp += 1;
1321                i += 1;
1322            }
1323        }
1324        return writer.write_all(&translated[..wp]);
1325    }
1326
1327    // Single-write fast path: translate+squeeze all data in one pass, one write
1328    if data.len() <= SINGLE_WRITE_LIMIT {
1329        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
1330        let mut last_squeezed: u16 = 256;
1331        unsafe {
1332            buf.set_len(data.len());
1333            let outp: *mut u8 = buf.as_mut_ptr();
1334            let inp = data.as_ptr();
1335            let len = data.len();
1336            let mut wp = 0;
1337            let mut i = 0;
1338            while i < len {
1339                let translated = *table.get_unchecked(*inp.add(i) as usize);
1340                if is_member(&squeeze_set, translated) {
1341                    if last_squeezed == translated as u16 {
1342                        i += 1;
1343                        continue;
1344                    }
1345                    last_squeezed = translated as u16;
1346                } else {
1347                    last_squeezed = 256;
1348                }
1349                *outp.add(wp) = translated;
1350                wp += 1;
1351                i += 1;
1352            }
1353            buf.set_len(wp);
1354        }
1355        return writer.write_all(&buf);
1356    }
1357
1358    // Chunked path for large data
1359    let buf_size = data.len().min(BUF_SIZE);
1360    let mut buf = vec![0u8; buf_size];
1361    let mut last_squeezed: u16 = 256;
1362
1363    for chunk in data.chunks(buf_size) {
1364        translate_to(chunk, &mut buf[..chunk.len()], &table);
1365        let mut wp = 0;
1366        unsafe {
1367            let ptr = buf.as_mut_ptr();
1368            for i in 0..chunk.len() {
1369                let b = *ptr.add(i);
1370                if is_member(&squeeze_set, b) {
1371                    if last_squeezed == b as u16 {
1372                        continue;
1373                    }
1374                    last_squeezed = b as u16;
1375                } else {
1376                    last_squeezed = 256;
1377                }
1378                *ptr.add(wp) = b;
1379                wp += 1;
1380            }
1381        }
1382        writer.write_all(&buf[..wp])?;
1383    }
1384    Ok(())
1385}
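
// A small sketch (not part of the original test suite) of the combined
// semantics: translation happens first, then runs of bytes belonging to set2
// collapse to one.
#[cfg(test)]
mod translate_squeeze_mmap_tests {
    use super::translate_squeeze_mmap;

    #[test]
    fn translates_then_squeezes() {
        let mut out: Vec<u8> = Vec::new();
        translate_squeeze_mmap(b"ab", b"xy", b"aabb", &mut out).unwrap();
        assert_eq!(out, b"xy".to_vec());
    }
}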
1386
1387/// Delete from mmap'd byte slice.
1388///
1389/// For data >= 2MB: uses rayon parallel processing across multiple cores.
1390/// For data <= 16MB: delete into one buffer, one write syscall.
1391/// For data > 16MB: chunked approach to limit memory.
1392pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1393    if delete_chars.len() == 1 {
1394        return delete_single_char_mmap(delete_chars[0], data, writer);
1395    }
1396    if delete_chars.len() <= 3 {
1397        return delete_multi_memchr_mmap(delete_chars, data, writer);
1398    }
1399
1400    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
1401    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1402        return delete_range_mmap(data, writer, lo, hi);
1403    }
1404
1405    let member = build_member_set(delete_chars);
1406
1407    // Parallel path: pre-allocate a single output buffer of data.len() and have each
1408    // thread write to its non-overlapping slice, then do a single write_all.
1409    // This avoids per-chunk Vec allocations that the old approach had.
1410    if data.len() >= PARALLEL_THRESHOLD {
1411        let n_threads = rayon::current_num_threads().max(1);
1412        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1413
1414        // Each thread deletes into its slice of outbuf and returns bytes written.
1415        let mut outbuf = alloc_uninit_vec(data.len());
1416        let chunk_lens: Vec<usize> = data
1417            .par_chunks(chunk_size)
1418            .zip(outbuf.par_chunks_mut(chunk_size))
1419            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
1420            .collect();
1421
1422        // Compact: move each chunk's output to be contiguous.
1423        // chunk_lens[i] is how many bytes thread i wrote into its slice.
1424        // We need to shift them together since each dst_chunk started at chunk_size offsets.
1425        let mut write_pos = 0;
1426        let mut src_offset = 0;
1427        for &clen in &chunk_lens {
1428            if clen > 0 && src_offset != write_pos {
1429                unsafe {
1430                    std::ptr::copy(
1431                        outbuf.as_ptr().add(src_offset),
1432                        outbuf.as_mut_ptr().add(write_pos),
1433                        clen,
1434                    );
1435                }
1436            }
1437            write_pos += clen;
1438            src_offset += chunk_size;
1439        }
1440
1441        return writer.write_all(&outbuf[..write_pos]);
1442    }
1443
1444    // Single-write fast path: delete into one buffer, one write
1445    if data.len() <= SINGLE_WRITE_LIMIT {
1446        let mut outbuf = alloc_uninit_vec(data.len());
1447        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
1448        return writer.write_all(&outbuf[..out_pos]);
1449    }
1450
1451    // Chunked path for large data
1452    let buf_size = data.len().min(BUF_SIZE);
1453    let mut outbuf = alloc_uninit_vec(buf_size);
1454
1455    for chunk in data.chunks(buf_size) {
1456        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
1457        writer.write_all(&outbuf[..out_pos])?;
1458    }
1459    Ok(())
1460}
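
// A small sketch (not part of the original test suite) of the general bitset
// path: more than three delete bytes, not a contiguous range, data far below
// the parallel threshold.
#[cfg(test)]
mod delete_mmap_tests {
    use super::delete_mmap;

    #[test]
    fn drops_vowels() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b"aeiou", b"hello world", &mut out).unwrap();
        assert_eq!(out, b"hll wrld".to_vec());
    }
}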
1461
1462/// SIMD range delete for mmap data, with rayon parallel processing.
1463fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
1464    // Parallel path: each thread deletes from its chunk into a local Vec
1465    if data.len() >= PARALLEL_THRESHOLD {
1466        let n_threads = rayon::current_num_threads().max(1);
1467        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1468
1469        let results: Vec<Vec<u8>> = data
1470            .par_chunks(chunk_size)
1471            .map(|chunk| {
1472                let mut out = alloc_uninit_vec(chunk.len());
1473                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
1474                unsafe { out.set_len(wp) };
1475                out
1476            })
1477            .collect();
1478
1479        let slices: Vec<std::io::IoSlice> = results
1480            .iter()
1481            .filter(|r| !r.is_empty())
1482            .map(|r| std::io::IoSlice::new(r))
1483            .collect();
1484        return write_ioslices(writer, &slices);
1485    }
1486
1487    // Single-write fast path
1488    if data.len() <= SINGLE_WRITE_LIMIT {
1489        let mut outbuf = alloc_uninit_vec(data.len());
1490        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
1491        return writer.write_all(&outbuf[..wp]);
1492    }
1493
1494    // Chunked path
1495    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
1496    for chunk in data.chunks(BUF_SIZE) {
1497        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
1498        writer.write_all(&outbuf[..wp])?;
1499    }
1500    Ok(())
1501}
1502
1503/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
1504/// Returns number of bytes written.
1505#[inline]
1506fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
1507    let len = chunk.len();
1508    let mut out_pos = 0;
1509    let mut i = 0;
1510
1511    while i + 8 <= len {
1512        unsafe {
1513            let b0 = *chunk.get_unchecked(i);
1514            let b1 = *chunk.get_unchecked(i + 1);
1515            let b2 = *chunk.get_unchecked(i + 2);
1516            let b3 = *chunk.get_unchecked(i + 3);
1517            let b4 = *chunk.get_unchecked(i + 4);
1518            let b5 = *chunk.get_unchecked(i + 5);
1519            let b6 = *chunk.get_unchecked(i + 6);
1520            let b7 = *chunk.get_unchecked(i + 7);
1521
1522            *outbuf.get_unchecked_mut(out_pos) = b0;
1523            out_pos += !is_member(member, b0) as usize;
1524            *outbuf.get_unchecked_mut(out_pos) = b1;
1525            out_pos += !is_member(member, b1) as usize;
1526            *outbuf.get_unchecked_mut(out_pos) = b2;
1527            out_pos += !is_member(member, b2) as usize;
1528            *outbuf.get_unchecked_mut(out_pos) = b3;
1529            out_pos += !is_member(member, b3) as usize;
1530            *outbuf.get_unchecked_mut(out_pos) = b4;
1531            out_pos += !is_member(member, b4) as usize;
1532            *outbuf.get_unchecked_mut(out_pos) = b5;
1533            out_pos += !is_member(member, b5) as usize;
1534            *outbuf.get_unchecked_mut(out_pos) = b6;
1535            out_pos += !is_member(member, b6) as usize;
1536            *outbuf.get_unchecked_mut(out_pos) = b7;
1537            out_pos += !is_member(member, b7) as usize;
1538        }
1539        i += 8;
1540    }
1541
1542    while i < len {
1543        unsafe {
1544            let b = *chunk.get_unchecked(i);
1545            *outbuf.get_unchecked_mut(out_pos) = b;
1546            out_pos += !is_member(member, b) as usize;
1547        }
1548        i += 1;
1549    }
1550
1551    out_pos
1552}
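
// A small sketch exercising the branchless kernel directly: build_member_set marks the
// vowels, and the kernel compacts the surviving bytes into the front of the output buffer.
// The test name and input are illustrative only.
#[cfg(test)]
mod delete_chunk_bitset_into_tests {
    use super::*;

    #[test]
    fn compacts_non_member_bytes() {
        let member = build_member_set(b"aeiou");
        let input = b"the quick brown fox";
        let mut out = vec![0u8; input.len()];
        let written = delete_chunk_bitset_into(input, &member, &mut out);
        assert_eq!(&out[..written], b"th qck brwn fx");
    }
}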
1553
1554fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1555    // Parallel path for large data: each thread deletes from its chunk into a local Vec,
1556    // then writev writes all the results in MAX_IOV-sized batches.
1557    if data.len() >= PARALLEL_THRESHOLD {
1558        let n_threads = rayon::current_num_threads().max(1);
1559        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1560
1561        let results: Vec<Vec<u8>> = data
1562            .par_chunks(chunk_size)
1563            .map(|chunk| {
1564                let mut out = Vec::with_capacity(chunk.len());
1565                let mut last = 0;
1566                for pos in memchr::memchr_iter(ch, chunk) {
1567                    if pos > last {
1568                        out.extend_from_slice(&chunk[last..pos]);
1569                    }
1570                    last = pos + 1;
1571                }
1572                if last < chunk.len() {
1573                    out.extend_from_slice(&chunk[last..]);
1574                }
1575                out
1576            })
1577            .collect();
1578
1579        // Use writev to batch all results into fewer syscalls
1580        let slices: Vec<std::io::IoSlice> = results
1581            .iter()
1582            .filter(|r| !r.is_empty())
1583            .map(|r| std::io::IoSlice::new(r))
1584            .collect();
1585        return write_ioslices(writer, &slices);
1586    }
1587
1588    // Single-write fast path: collect all non-deleted spans into one buffer
1589    if data.len() <= SINGLE_WRITE_LIMIT {
1590        let mut outbuf = Vec::with_capacity(data.len());
1591        let mut last = 0;
1592        for pos in memchr::memchr_iter(ch, data) {
1593            if pos > last {
1594                outbuf.extend_from_slice(&data[last..pos]);
1595            }
1596            last = pos + 1;
1597        }
1598        if last < data.len() {
1599            outbuf.extend_from_slice(&data[last..]);
1600        }
1601        return writer.write_all(&outbuf);
1602    }
1603
1604    // Chunked path for large data
1605    let buf_size = data.len().min(BUF_SIZE);
1606    let mut outbuf = vec![0u8; buf_size];
1607
1608    for chunk in data.chunks(buf_size) {
1609        let mut wp = 0;
1610        let mut last = 0;
1611        for pos in memchr::memchr_iter(ch, chunk) {
1612            if pos > last {
1613                let run = pos - last;
1614                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
1615                wp += run;
1616            }
1617            last = pos + 1;
1618        }
1619        if last < chunk.len() {
1620            let run = chunk.len() - last;
1621            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
1622            wp += run;
1623        }
1624        writer.write_all(&outbuf[..wp])?;
1625    }
1626    Ok(())
1627}
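
// Usage sketch for the memchr-based single-character delete; the small input takes the
// single-write fast path and a Vec<u8> writer captures the output. Inputs are illustrative.
#[cfg(test)]
mod delete_single_char_mmap_tests {
    use super::*;

    #[test]
    fn strips_every_occurrence() {
        let mut out: Vec<u8> = Vec::new();
        delete_single_char_mmap(b'-', b"a-b--c---d", &mut out).unwrap();
        assert_eq!(out, b"abcd".to_vec());
    }
}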
1628
1629fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1630    let c0 = chars[0];
1631    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
1632    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
1633    let is_three = chars.len() >= 3;
1634
1635    // Parallel path for large data
1636    if data.len() >= PARALLEL_THRESHOLD {
1637        let n_threads = rayon::current_num_threads().max(1);
1638        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1639
1640        let results: Vec<Vec<u8>> = data
1641            .par_chunks(chunk_size)
1642            .map(|chunk| {
1643                let mut out = Vec::with_capacity(chunk.len());
1644                let mut last = 0;
1645                if is_three {
1646                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
1647                        if pos > last {
1648                            out.extend_from_slice(&chunk[last..pos]);
1649                        }
1650                        last = pos + 1;
1651                    }
1652                } else {
1653                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
1654                        if pos > last {
1655                            out.extend_from_slice(&chunk[last..pos]);
1656                        }
1657                        last = pos + 1;
1658                    }
1659                }
1660                if last < chunk.len() {
1661                    out.extend_from_slice(&chunk[last..]);
1662                }
1663                out
1664            })
1665            .collect();
1666
1667        // Use writev to batch all results into fewer syscalls
1668        let slices: Vec<std::io::IoSlice> = results
1669            .iter()
1670            .filter(|r| !r.is_empty())
1671            .map(|r| std::io::IoSlice::new(r))
1672            .collect();
1673        return write_ioslices(writer, &slices);
1674    }
1675
1676    // Single-write fast path: collect all non-deleted spans into one buffer
1677    if data.len() <= SINGLE_WRITE_LIMIT {
1678        let mut outbuf = Vec::with_capacity(data.len());
1679        let mut last = 0;
1680        if is_three {
1681            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
1682                if pos > last {
1683                    outbuf.extend_from_slice(&data[last..pos]);
1684                }
1685                last = pos + 1;
1686            }
1687        } else {
1688            for pos in memchr::memchr2_iter(c0, c1, data) {
1689                if pos > last {
1690                    outbuf.extend_from_slice(&data[last..pos]);
1691                }
1692                last = pos + 1;
1693            }
1694        }
1695        if last < data.len() {
1696            outbuf.extend_from_slice(&data[last..]);
1697        }
1698        return writer.write_all(&outbuf);
1699    }
1700
1701    // Chunked path for large data
1702    let buf_size = data.len().min(BUF_SIZE);
1703    let mut outbuf = vec![0u8; buf_size];
1704
1705    for chunk in data.chunks(buf_size) {
1706        let mut wp = 0;
1707        let mut last = 0;
1708
1709        // Iterate directly over the memchr iterator without collecting positions into a Vec<usize>.
1710        // Positions are used exactly once in order, so no intermediate allocation needed.
1711        if is_three {
1712            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
1713                if pos > last {
1714                    let run = pos - last;
1715                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
1716                    wp += run;
1717                }
1718                last = pos + 1;
1719            }
1720        } else {
1721            for pos in memchr::memchr2_iter(c0, c1, chunk) {
1722                if pos > last {
1723                    let run = pos - last;
1724                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
1725                    wp += run;
1726                }
1727                last = pos + 1;
1728            }
1729        }
1730
1731        if last < chunk.len() {
1732            let run = chunk.len() - last;
1733            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
1734            wp += run;
1735        }
1736        writer.write_all(&outbuf[..wp])?;
1737    }
1738    Ok(())
1739}
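
// Usage sketch for the two/three-character delete; three delete characters exercise the
// memchr3 branch on the single-write fast path. Inputs are illustrative.
#[cfg(test)]
mod delete_multi_memchr_mmap_tests {
    use super::*;

    #[test]
    fn strips_all_three_characters() {
        let mut out: Vec<u8> = Vec::new();
        delete_multi_memchr_mmap(b",;:", b"a,b;c:d,,;;::e", &mut out).unwrap();
        assert_eq!(out, b"abcde".to_vec());
    }
}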
1740
1741/// Delete + squeeze from mmap'd byte slice.
1742///
1743/// For data <= 16MB: delete+squeeze into one buffer, one write syscall.
1744/// For data > 16MB: chunked approach to limit memory.
1745pub fn delete_squeeze_mmap(
1746    delete_chars: &[u8],
1747    squeeze_chars: &[u8],
1748    data: &[u8],
1749    writer: &mut impl Write,
1750) -> io::Result<()> {
1751    let delete_set = build_member_set(delete_chars);
1752    let squeeze_set = build_member_set(squeeze_chars);
1753
1754    // Single-write fast path: delete+squeeze all data in one pass, one write
1755    if data.len() <= SINGLE_WRITE_LIMIT {
1756        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
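        // `last_squeezed` holds the previously emitted squeeze-set byte; 256 is outside the
        // u8 range, so it compares unequal to every byte and acts as the "none" sentinel.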
1757        let mut last_squeezed: u16 = 256;
1758        unsafe {
1759            outbuf.set_len(data.len());
1760            let outp: *mut u8 = outbuf.as_mut_ptr();
1761            let inp = data.as_ptr();
1762            let len = data.len();
1763            let mut out_pos = 0;
1764            let mut i = 0;
1765            while i < len {
1766                let b = *inp.add(i);
1767                if is_member(&delete_set, b) {
1768                    i += 1;
1769                    continue;
1770                }
1771                if is_member(&squeeze_set, b) {
1772                    if last_squeezed == b as u16 {
1773                        i += 1;
1774                        continue;
1775                    }
1776                    last_squeezed = b as u16;
1777                } else {
1778                    last_squeezed = 256;
1779                }
1780                *outp.add(out_pos) = b;
1781                out_pos += 1;
1782                i += 1;
1783            }
1784            outbuf.set_len(out_pos);
1785        }
1786        return writer.write_all(&outbuf);
1787    }
1788
1789    // Chunked path for large data
1790    let buf_size = data.len().min(BUF_SIZE);
1791    let mut outbuf = vec![0u8; buf_size];
1792    let mut last_squeezed: u16 = 256;
1793
1794    for chunk in data.chunks(buf_size) {
1795        let mut out_pos = 0;
1796        for &b in chunk {
1797            if is_member(&delete_set, b) {
1798                continue;
1799            }
1800            if is_member(&squeeze_set, b) {
1801                if last_squeezed == b as u16 {
1802                    continue;
1803                }
1804                last_squeezed = b as u16;
1805            } else {
1806                last_squeezed = 256;
1807            }
1808            unsafe {
1809                *outbuf.get_unchecked_mut(out_pos) = b;
1810            }
1811            out_pos += 1;
1812        }
1813        writer.write_all(&outbuf[..out_pos])?;
1814    }
1815    Ok(())
1816}
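
// A usage sketch for the public delete + squeeze entry point on a small buffer (the
// single-write fast path): digits are deleted, then runs of spaces are squeezed.
// The test name and inputs are illustrative only.
#[cfg(test)]
mod delete_squeeze_mmap_tests {
    use super::*;

    #[test]
    fn deletes_then_squeezes() {
        let mut out: Vec<u8> = Vec::new();
        delete_squeeze_mmap(b"0123456789", b" ", b"a1b2  c3  d", &mut out).unwrap();
        assert_eq!(out, b"ab c d".to_vec());
    }
}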
1817
1818/// Squeeze from mmap'd byte slice.
1819///
1820/// Squeeze sets of 1-3 bytes dispatch to memchr-based specializations. Otherwise,
1821/// data >= 2MB is squeezed in parallel with rayon, with boundary fixup between chunks;
1822/// smaller data goes into one buffer and one write syscall, with a chunked fallback to bound memory.
1823pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1824    if squeeze_chars.len() == 1 {
1825        return squeeze_single_mmap(squeeze_chars[0], data, writer);
1826    }
1827    if squeeze_chars.len() == 2 {
1828        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
1829    }
1830    if squeeze_chars.len() == 3 {
1831        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
1832    }
1833
1834    let member = build_member_set(squeeze_chars);
1835
1836    // Parallel path: squeeze each chunk independently, then fix boundaries
1837    if data.len() >= PARALLEL_THRESHOLD {
1838        let n_threads = rayon::current_num_threads().max(1);
1839        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1840
1841        let results: Vec<Vec<u8>> = data
1842            .par_chunks(chunk_size)
1843            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
1844            .collect();
1845
1846        // Build the IoSlice list, fixing chunk boundaries: if the nearest preceding
1847        // non-empty result ends with byte B, B is in the squeeze set, and this result
1848        // starts with B, skip this result's leading bytes that equal B so the squeeze
1849        // continues across the boundary. Collect slices for writev to minimize syscalls.
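        // e.g. if one chunk's squeezed output ends in "...x" and the next begins with "x...",
        // and 'x' is squeezable, the second slice is emitted starting after its leading 'x'.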
1850        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
1851        for (idx, result) in results.iter().enumerate() {
1852            if result.is_empty() {
1853                continue;
1854            }
1855            if idx > 0 {
1856                // Check boundary: does previous chunk end with same squeezable byte?
1857                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
1858                    if is_member(&member, prev_last) {
1859                        // Skip leading bytes in this chunk that equal prev_last
1860                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
1861                        if skip < result.len() {
1862                            slices.push(std::io::IoSlice::new(&result[skip..]));
1863                        }
1864                        continue;
1865                    }
1866                }
1867            }
1868            slices.push(std::io::IoSlice::new(result));
1869        }
1870        return write_ioslices(writer, &slices);
1871    }
1872
1873    // Single-write fast path: squeeze all data into one buffer, one write
1874    if data.len() <= SINGLE_WRITE_LIMIT {
1875        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
1876        let mut last_squeezed: u16 = 256;
1877        let len = data.len();
1878        let mut wp = 0;
1879        let mut i = 0;
1880
1881        unsafe {
1882            outbuf.set_len(data.len());
1883            let inp = data.as_ptr();
1884            let outp: *mut u8 = outbuf.as_mut_ptr();
1885
1886            while i < len {
1887                let b = *inp.add(i);
1888                if is_member(&member, b) {
1889                    if last_squeezed != b as u16 {
1890                        *outp.add(wp) = b;
1891                        wp += 1;
1892                        last_squeezed = b as u16;
1893                    }
1894                    i += 1;
1895                    while i < len && *inp.add(i) == b {
1896                        i += 1;
1897                    }
1898                } else {
1899                    last_squeezed = 256;
1900                    *outp.add(wp) = b;
1901                    wp += 1;
1902                    i += 1;
1903                }
1904            }
1905            outbuf.set_len(wp);
1906        }
1907        return writer.write_all(&outbuf);
1908    }
1909
1910    // Chunked path for large data
1911    let buf_size = data.len().min(BUF_SIZE);
1912    let mut outbuf = vec![0u8; buf_size];
1913    let mut last_squeezed: u16 = 256;
1914
1915    for chunk in data.chunks(buf_size) {
1916        let len = chunk.len();
1917        let mut wp = 0;
1918        let mut i = 0;
1919
1920        unsafe {
1921            let inp = chunk.as_ptr();
1922            let outp = outbuf.as_mut_ptr();
1923
1924            while i < len {
1925                let b = *inp.add(i);
1926                if is_member(&member, b) {
1927                    if last_squeezed != b as u16 {
1928                        *outp.add(wp) = b;
1929                        wp += 1;
1930                        last_squeezed = b as u16;
1931                    }
1932                    i += 1;
1933                    while i < len && *inp.add(i) == b {
1934                        i += 1;
1935                    }
1936                } else {
1937                    last_squeezed = 256;
1938                    *outp.add(wp) = b;
1939                    wp += 1;
1940                    i += 1;
1941                }
1942            }
1943        }
1944        writer.write_all(&outbuf[..wp])?;
1945    }
1946    Ok(())
1947}
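
// Usage sketch for the public squeeze entry point with a four-byte squeeze set, which
// takes the general bitset path rather than the 1-3 character memchr specializations.
// The test name and inputs are illustrative only.
#[cfg(test)]
mod squeeze_mmap_tests {
    use super::*;

    #[test]
    fn collapses_runs_of_set_members_only() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b"abcd", b"aabbxxccdd", &mut out).unwrap();
        assert_eq!(out, b"abxxcd".to_vec());
    }
}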
1948
1949/// Squeeze a single chunk using bitset membership. Returns squeezed output.
1950fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
1951    let len = chunk.len();
1952    let mut out = Vec::with_capacity(len);
1953    let mut last_squeezed: u16 = 256;
1954    let mut i = 0;
1955
1956    unsafe {
1957        out.set_len(len);
1958        let inp = chunk.as_ptr();
1959        let outp: *mut u8 = out.as_mut_ptr();
1960        let mut wp = 0;
1961
1962        while i < len {
1963            let b = *inp.add(i);
1964            if is_member(member, b) {
1965                if last_squeezed != b as u16 {
1966                    *outp.add(wp) = b;
1967                    wp += 1;
1968                    last_squeezed = b as u16;
1969                }
1970                i += 1;
1971                while i < len && *inp.add(i) == b {
1972                    i += 1;
1973                }
1974            } else {
1975                last_squeezed = 256;
1976                *outp.add(wp) = b;
1977                wp += 1;
1978                i += 1;
1979            }
1980        }
1981        out.set_len(wp);
1982    }
1983    out
1984}
1985
1986fn squeeze_multi_mmap<const N: usize>(
1987    chars: &[u8],
1988    data: &[u8],
1989    writer: &mut impl Write,
1990) -> io::Result<()> {
1991    // Parallel path for large data: squeeze each chunk, fix chunk boundaries, then batch the writes with writev
1992    if data.len() >= PARALLEL_THRESHOLD {
1993        let member = build_member_set(chars);
1994        let n_threads = rayon::current_num_threads().max(1);
1995        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1996
1997        let results: Vec<Vec<u8>> = data
1998            .par_chunks(chunk_size)
1999            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2000            .collect();
2001
2002        // Build IoSlice list, fixing boundaries
2003        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2004        for (idx, result) in results.iter().enumerate() {
2005            if result.is_empty() {
2006                continue;
2007            }
2008            if idx > 0 {
2009                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2010                    if is_member(&member, prev_last) {
2011                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2012                        if skip < result.len() {
2013                            slices.push(std::io::IoSlice::new(&result[skip..]));
2014                        }
2015                        continue;
2016                    }
2017                }
2018            }
2019            slices.push(std::io::IoSlice::new(result));
2020        }
2021        return write_ioslices(writer, &slices);
2022    }
2023
2024    let buf_size = data.len().min(BUF_SIZE);
2025    let mut outbuf = vec![0u8; buf_size];
2026    let mut wp = 0;
2027    let mut last_squeezed: u16 = 256;
2028    let mut cursor = 0;
2029
2030    macro_rules! find_next {
2031        ($data:expr) => {
2032            if N == 2 {
2033                memchr::memchr2(chars[0], chars[1], $data)
2034            } else {
2035                memchr::memchr3(chars[0], chars[1], chars[2], $data)
2036            }
2037        };
2038    }
2039
2040    macro_rules! flush_and_copy {
2041        ($src:expr, $len:expr) => {
2042            if wp + $len > buf_size {
2043                writer.write_all(&outbuf[..wp])?;
2044                wp = 0;
2045            }
2046            if $len > buf_size {
2047                writer.write_all($src)?;
2048            } else {
2049                outbuf[wp..wp + $len].copy_from_slice($src);
2050                wp += $len;
2051            }
2052        };
2053    }
2054
2055    while cursor < data.len() {
2056        match find_next!(&data[cursor..]) {
2057            Some(offset) => {
2058                let pos = cursor + offset;
2059                let b = data[pos];
2060                if pos > cursor {
2061                    let span = pos - cursor;
2062                    flush_and_copy!(&data[cursor..pos], span);
2063                    last_squeezed = 256;
2064                }
2065                if last_squeezed != b as u16 {
2066                    if wp >= buf_size {
2067                        writer.write_all(&outbuf[..wp])?;
2068                        wp = 0;
2069                    }
2070                    outbuf[wp] = b;
2071                    wp += 1;
2072                    last_squeezed = b as u16;
2073                }
2074                let mut skip = pos + 1;
2075                while skip < data.len() && data[skip] == b {
2076                    skip += 1;
2077                }
2078                cursor = skip;
2079            }
2080            None => {
2081                let remaining = data.len() - cursor;
2082                flush_and_copy!(&data[cursor..], remaining);
2083                break;
2084            }
2085        }
2086    }
2087    if wp > 0 {
2088        writer.write_all(&outbuf[..wp])?;
2089    }
2090    Ok(())
2091}
2092
2093fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2094    if data.is_empty() {
2095        return Ok(());
2096    }
2097
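    // Fast path: squeezing only changes the data if `ch` ever appears twice in a row, so if
    // no adjacent pair exists the input can be written through untouched.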
2098    if memchr::memmem::find(data, &[ch, ch]).is_none() {
2099        return writer.write_all(data);
2100    }
2101
2102    // Parallel path: squeeze each chunk, fix boundaries
2103    if data.len() >= PARALLEL_THRESHOLD {
2104        let n_threads = rayon::current_num_threads().max(1);
2105        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2106
2107        let results: Vec<Vec<u8>> = data
2108            .par_chunks(chunk_size)
2109            .map(|chunk| {
2110                let mut out = Vec::with_capacity(chunk.len());
2111                let mut cursor = 0;
2112                while cursor < chunk.len() {
2113                    match memchr::memchr(ch, &chunk[cursor..]) {
2114                        Some(offset) => {
2115                            let pos = cursor + offset;
2116                            if pos > cursor {
2117                                out.extend_from_slice(&chunk[cursor..pos]);
2118                            }
2119                            out.push(ch);
2120                            cursor = pos + 1;
2121                            while cursor < chunk.len() && chunk[cursor] == ch {
2122                                cursor += 1;
2123                            }
2124                        }
2125                        None => {
2126                            out.extend_from_slice(&chunk[cursor..]);
2127                            break;
2128                        }
2129                    }
2130                }
2131                out
2132            })
2133            .collect();
2134
2135        // Build the IoSlice list, continuing the squeeze across chunk boundaries.
2136        // Use writev to minimize syscalls.
2137        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2138        for (idx, result) in results.iter().enumerate() {
2139            if result.is_empty() {
2140                continue;
2141            }
2142            if idx > 0 {
2143                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2144                    if prev_last == ch {
2145                        // Skip leading ch bytes in this chunk result
2146                        let skip = result.iter().take_while(|&&b| b == ch).count();
2147                        if skip < result.len() {
2148                            slices.push(std::io::IoSlice::new(&result[skip..]));
2149                        }
2150                        continue;
2151                    }
2152                }
2153            }
2154            slices.push(std::io::IoSlice::new(result));
2155        }
2156        return write_ioslices(writer, &slices);
2157    }
2158
2159    let buf_size = data.len().min(BUF_SIZE);
2160    let mut outbuf = vec![0u8; buf_size];
2161    let len = data.len();
2162    let mut wp = 0;
2163    let mut cursor = 0;
2164
2165    while cursor < len {
2166        match memchr::memchr(ch, &data[cursor..]) {
2167            Some(offset) => {
2168                let pos = cursor + offset;
2169                let gap = pos - cursor;
2170                if gap > 0 {
2171                    if wp + gap > buf_size {
2172                        writer.write_all(&outbuf[..wp])?;
2173                        wp = 0;
2174                    }
2175                    if gap > buf_size {
2176                        writer.write_all(&data[cursor..pos])?;
2177                    } else {
2178                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
2179                        wp += gap;
2180                    }
2181                }
2182                if wp >= buf_size {
2183                    writer.write_all(&outbuf[..wp])?;
2184                    wp = 0;
2185                }
2186                outbuf[wp] = ch;
2187                wp += 1;
2188                cursor = pos + 1;
2189                while cursor < len && data[cursor] == ch {
2190                    cursor += 1;
2191                }
2192            }
2193            None => {
2194                let remaining = len - cursor;
2195                if remaining > 0 {
2196                    if wp + remaining > buf_size {
2197                        writer.write_all(&outbuf[..wp])?;
2198                        wp = 0;
2199                    }
2200                    if remaining > buf_size {
2201                        writer.write_all(&data[cursor..])?;
2202                    } else {
2203                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
2204                        wp += remaining;
2205                    }
2206                }
2207                break;
2208            }
2209        }
2210    }
2211
2212    if wp > 0 {
2213        writer.write_all(&outbuf[..wp])?;
2214    }
2215    Ok(())
2216}
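
// Usage sketch for the single-character squeeze, covering both the pass-through fast path
// (no adjacent pair of `ch`) and the buffered squeeze path. Inputs are illustrative only.
#[cfg(test)]
mod squeeze_single_mmap_tests {
    use super::*;

    #[test]
    fn passes_through_when_no_adjacent_pair() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_single_mmap(b'z', b"hello world", &mut out).unwrap();
        assert_eq!(out, b"hello world".to_vec());
    }

    #[test]
    fn collapses_adjacent_runs() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_single_mmap(b'l', b"hello   world", &mut out).unwrap();
        assert_eq!(out, b"helo   world".to_vec());
    }
}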