coreutils_rs/tr/core.rs

use std::io::{self, Read, Write};

use rayon::prelude::*;

/// Maximum IoSlice entries per write_vectored batch.
/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
const MAX_IOV: usize = 1024;

/// Main processing buffer: 4MB (fits in L3 cache, avoids cache thrashing).
const BUF_SIZE: usize = 4 * 1024 * 1024;

/// Stream buffer: 8MB — larger buffer = fewer read/write syscalls for streaming.
const STREAM_BUF: usize = 8 * 1024 * 1024;

/// Minimum data size to engage rayon parallel processing for mmap paths.
/// Below this, single-threaded is faster due to thread pool overhead.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
/// Falls back to write_all per slice for partial writes.
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                // Partial write: fall back to write_all per remaining slice
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}

/// Allocate a Vec<u8> of given length without zero-initialization.
/// SAFETY: Caller must write all bytes before reading them.
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
    unsafe {
        v.set_len(len);
    }
    v
}

/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
/// If set2 is shorter than set1, the remaining set1 bytes map to set2's last byte;
/// bytes not in set1 map to themselves.
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}

/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}
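
// Quick sanity sketch of the bitset: each byte sets one bit in the 32-byte array,
// and is_member reads it back. Example bytes are arbitrary.
#[cfg(test)]
mod member_set_example {
    use super::*;

    #[test]
    fn membership_round_trips() {
        let set = build_member_set(b"\n\t ");
        assert!(is_member(&set, b'\n'));
        assert!(is_member(&set, b'\t'));
        assert!(is_member(&set, b' '));
        assert!(!is_member(&set, b'a'));
        assert!(!is_member(&set, 0xFF));
    }
}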

/// Translate bytes in-place using a 256-byte lookup table.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    for b in data.iter_mut() {
        *b = unsafe { *table.get_unchecked(*b as usize) };
    }
}

/// Translate bytes from source to destination using a 256-byte lookup table.
#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// SIMD range translation (x86_64)
// ============================================================================

/// Detect if the translate table is a single contiguous range with constant offset.
/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
/// table[i] = i + offset for all i in [lo, hi].
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}
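
// Sketch of the range detection on the classic lowercase-to-uppercase table: every
// byte in b'a'..=b'z' moves by the same -32 offset, so the table collapses to a single
// (lo, hi, offset) triple; a non-constant mapping does not. Module name is illustrative.
#[cfg(test)]
mod range_detect_example {
    use super::*;

    #[test]
    fn detects_constant_offset_range() {
        let lower: Vec<u8> = (b'a'..=b'z').collect();
        let upper: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&lower, &upper);
        assert_eq!(detect_range_offset(&table), Some((b'a', b'z', -32)));
    }

    #[test]
    fn rejects_non_constant_mapping() {
        // 'a'->'x' is +23, 'b'->'z' is +24: offsets differ, so no single range.
        let table = build_translate_table(b"ab", b"xz");
        assert_eq!(detect_range_offset(&table), None);
    }
}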

/// SIMD-accelerated range translation for mmap'd data.
/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Bias: shift range so lo maps to -128 (signed min).
        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
        // For input < lo: biased wraps to large positive (signed), > threshold.
        // For input > hi: biased > -128+range, > threshold.
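        //
        // Worked example (assuming the `a-z` -> `A-Z` case, i.e. lo=97, hi=122, range=25):
        //   bias = 0x80 - 97 = 31, threshold = 0x80 + 25 = 153 = -103 (as i8)
        //   input 'c' (99):  99 + 31 = 130 = -126 (as i8), not > -103 -> in range, offset applied
        //   input '0' (48):  48 + 31 = 79 (as i8, positive),   > -103 -> out of range, unchanged
        //   input '{' (123): 123 + 31 = 154 = -102 (as i8),    > -103 -> out of range, unchanged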
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt = 0xFF where biased > threshold (OUT of range)
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // mask = 0xFF where IN range (NOT gt)
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        // SSE2 tail for 16-byte remainder
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

/// Scalar range translation fallback for non-x86_64.
#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (i, &b) in src.iter().enumerate() {
        dst[i] = if b >= lo && b <= hi {
            b.wrapping_add(offset as u8)
        } else {
            b
        };
    }
}

// ============================================================================
// In-place SIMD range translation (saves one buffer allocation in streaming)
// ============================================================================

/// In-place SIMD-accelerated range translation.
/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = b.wrapping_add(offset as u8);
        }
    }
}

// ============================================================================
// Streaming functions (Read + Write)
// ============================================================================

pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    // Try SIMD fast path for range translations (in-place, single buffer)
    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_range_stream(lo, hi, offset, reader, writer);
    }

    // General case: use separate src/dst buffers with 8x-unrolled translate_to.
    // This avoids the read-modify-write cache penalty of in-place translation:
    // reading and writing the same cache line forces store-to-load forwarding stalls.
    // With separate buffers, the CPU can pipeline reads from src while writing to dst.
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        translate_to(&src[..n], &mut dst[..n], &table);
        writer.write_all(&dst[..n])?;
    }
    Ok(())
}
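
// Minimal streaming usage sketch (test-only, with in-memory reader/writer): the classic
// `tr 'a-z' 'A-Z'` byte sets exercise the SIMD range fast path above. Module name is ours.
#[cfg(test)]
mod translate_stream_example {
    use super::*;

    #[test]
    fn uppercases_ascii() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut input = io::Cursor::new(b"hello, tr!".to_vec());
        let mut output = Vec::new();
        translate(&set1, &set2, &mut input, &mut output).unwrap();
        assert_eq!(output, b"HELLO, TR!");
    }
}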

/// Streaming SIMD range translation — single buffer, in-place transform.
/// Saves a second 8MB buffer allocation vs separate src/dst buffers.
fn translate_range_stream(
    lo: u8,
    hi: u8,
    offset: i8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

/// Read as many bytes as possible into buf, retrying on partial reads.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}

pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // Two-pass optimization for range translations:
    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
    // Even though it's two passes, the translate pass is so much faster with SIMD
    // that the total is still a net win.
    let range_info = detect_range_offset(&table);

    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        // Pass 1: translate
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        } else {
            translate_inplace(&mut buf[..n], &table);
        }
        // Pass 2: squeeze in-place
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
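
// Sketch of the translate+squeeze semantics implemented above: squeezing applies to runs
// of bytes that belong to set2 *after* translation. Example bytes are arbitrary.
#[cfg(test)]
mod translate_squeeze_stream_example {
    use super::*;

    #[test]
    fn squeezes_translated_runs() {
        let mut input = io::Cursor::new(b"aabbcc".to_vec());
        let mut output = Vec::new();
        translate_squeeze(b"abc", b"xxx", &mut input, &mut output).unwrap();
        // a, b, c all map to 'x'; the resulting run of six 'x's collapses to one
        assert_eq!(output, b"x");
    }
}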

pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                if !is_member(&member, b0) {
                    *ptr.add(wp) = b0;
                    wp += 1;
                }
                if !is_member(&member, b1) {
                    *ptr.add(wp) = b1;
                    wp += 1;
                }
                if !is_member(&member, b2) {
                    *ptr.add(wp) = b2;
                    wp += 1;
                }
                if !is_member(&member, b3) {
                    *ptr.add(wp) = b3;
                    wp += 1;
                }
                if !is_member(&member, b4) {
                    *ptr.add(wp) = b4;
                    wp += 1;
                }
                if !is_member(&member, b5) {
                    *ptr.add(wp) = b5;
                    wp += 1;
                }
                if !is_member(&member, b6) {
                    *ptr.add(wp) = b6;
                    wp += 1;
                }
                if !is_member(&member, b7) {
                    *ptr.add(wp) = b7;
                    wp += 1;
                }
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                if !is_member(&member, b) {
                    *ptr.add(wp) = b;
                    wp += 1;
                }
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
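
// Streaming delete sketch: five delete bytes forces the generic bitset path rather than
// the memchr fast paths above. Example input is arbitrary.
#[cfg(test)]
mod delete_stream_example {
    use super::*;

    #[test]
    fn drops_vowels() {
        let mut input = io::Cursor::new(b"hello world".to_vec());
        let mut output = Vec::new();
        delete(b"aeiou", &mut input, &mut output).unwrap();
        assert_eq!(output, b"hll wrld");
    }
}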

fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &buf[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
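
// Squeeze sketch: only runs of bytes in the squeeze set collapse; other repeats stay.
// Three squeeze bytes exercises the bitset path rather than the single-byte fast path.
#[cfg(test)]
mod squeeze_stream_example {
    use super::*;

    #[test]
    fn collapses_only_member_runs() {
        let mut input = io::Cursor::new(b"aabbcc  dd".to_vec());
        let mut output = Vec::new();
        squeeze(b"ab ", &mut input, &mut output).unwrap();
        // "aa" -> "a", "bb" -> "b", "  " -> " "; "cc" and "dd" are not in the set
        assert_eq!(output, b"abcc dd");
    }
}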

fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };

        let mut wp = 0;
        let mut i = 0;

        while i < n {
            if was_squeeze_char && buf[i] == ch {
                i += 1;
                while i < n && buf[i] == ch {
                    i += 1;
                }
                if i >= n {
                    break;
                }
            }

            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    let run_len = offset;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    i += run_len;

                    unsafe {
                        *buf.as_mut_ptr().add(wp) = ch;
                    }
                    wp += 1;
                    was_squeeze_char = true;
                    i += 1;
                    while i < n && buf[i] == ch {
                        i += 1;
                    }
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    was_squeeze_char = false;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

// ============================================================================
// Mmap-based functions (zero-copy input from byte slice)
// ============================================================================

/// Maximum data size for single-allocation translate approach.
/// Below this limit, translate ALL data into one buffer and do a single write_all.
/// Above this, use chunked approach to limit memory usage.
const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;

/// Translate bytes from an mmap'd byte slice.
/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
/// Falls back to scalar 256-byte table lookup for general translations.
///
/// For data >= 2MB: uses rayon parallel processing across multiple cores.
/// For data <= 16MB: single allocation + single write_all (1 syscall).
/// For data > 16MB: chunked approach to limit memory (N syscalls where N = data/4MB).
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    // Check if table is identity — pure passthrough
    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    // Try SIMD fast path for single-range constant-offset translations
    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_mmap_range(data, writer, lo, hi, offset);
    }

    // General case: table lookup (with parallel processing for large data)
    translate_mmap_table(data, writer, &table)
}
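
// Mmap-path sketch: a small input takes the single-allocation branch, and a range table
// like a-z -> A-Z goes through the SIMD range translator. Module name is illustrative.
#[cfg(test)]
mod translate_mmap_example {
    use super::*;

    #[test]
    fn uppercases_small_slice() {
        let lower: Vec<u8> = (b'a'..=b'z').collect();
        let upper: Vec<u8> = (b'A'..=b'Z').collect();
        let mut output = Vec::new();
        translate_mmap(&lower, &upper, b"abc XYZ 123", &mut output).unwrap();
        assert_eq!(output, b"ABC XYZ 123");
    }
}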

/// SIMD range translate for mmap data, with rayon parallel processing.
fn translate_mmap_range(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    offset: i8,
) -> io::Result<()> {
    // Parallel path: split data into chunks, translate each in parallel
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Process chunks in parallel: each thread writes to its slice of buf
        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
            });

        return writer.write_all(&buf);
    }

    // Small data: single-threaded SIMD
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_range_simd(data, &mut buf, lo, hi, offset);
        return writer.write_all(&buf);
    }
    // Chunked path for large data (shouldn't happen since PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT)
    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

/// General table-lookup translate for mmap data, with rayon parallel processing.
fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
    // Parallel path: split data into chunks, translate each in parallel
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
            });

        return writer.write_all(&buf);
    }

    // Small data: single-threaded
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_to(data, &mut buf, table);
        return writer.write_all(&buf);
    }
    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_to(chunk, &mut buf[..chunk.len()], table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

/// Translate + squeeze from mmap'd byte slice.
///
/// For data >= 2MB: two-phase approach: parallel translate, then sequential squeeze.
/// For data <= 16MB: single-pass translate+squeeze into one buffer, one write syscall.
/// For data > 16MB: chunked approach to limit memory.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // For large data: two-phase approach
    // Phase 1: parallel translate into buffer
    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
    //          (squeeze only removes bytes, never grows, so no second allocation needed)
    if data.len() >= PARALLEL_THRESHOLD {
        // Phase 1: parallel translate
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        // Phase 2: squeeze in-place on the translated buffer.
        // Since squeeze only removes bytes (never grows), we can read ahead and
        // compact into the same buffer, saving a full data.len() heap allocation.
        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    // Single-write fast path: translate+squeeze all data in one pass, one write
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            buf.set_len(data.len());
            let outp: *mut u8 = buf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut wp = 0;
            let mut i = 0;
            while i < len {
                let translated = *table.get_unchecked(*inp.add(i) as usize);
                if is_member(&squeeze_set, translated) {
                    if last_squeezed == translated as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = translated as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(wp) = translated;
                wp += 1;
                i += 1;
            }
            buf.set_len(wp);
        }
        return writer.write_all(&buf);
    }

    // Chunked path for large data
    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Delete from mmap'd byte slice.
///
/// For data >= 2MB: uses rayon parallel processing across multiple cores.
/// For data <= 16MB: delete into one buffer, one write syscall.
/// For data > 16MB: chunked approach to limit memory.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    let member = build_member_set(delete_chars);

    // Parallel path: pre-allocate a single output buffer of data.len() and have each
    // thread write to its non-overlapping slice, then do a single write_all.
    // This avoids per-chunk Vec allocations that the old approach had.
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Each thread deletes into its slice of outbuf and returns bytes written.
        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        // Compact: move each chunk's output to be contiguous.
        // chunk_lens[i] is how many bytes thread i wrote into its slice.
        // We need to shift them together since each dst_chunk started at chunk_size offsets.
        let mut write_pos = 0;
        let mut src_offset = 0;
        for &clen in &chunk_lens {
            if clen > 0 && src_offset != write_pos {
                unsafe {
                    std::ptr::copy(
                        outbuf.as_ptr().add(src_offset),
                        outbuf.as_mut_ptr().add(write_pos),
                        clen,
                    );
                }
            }
            write_pos += clen;
            src_offset += chunk_size;
        }

        return writer.write_all(&outbuf[..write_pos]);
    }

    // Single-write fast path: delete into one buffer, one write
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
        return writer.write_all(&outbuf[..out_pos]);
    }

    // Chunked path for large data
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = alloc_uninit_vec(buf_size);

    for chunk in data.chunks(buf_size) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
/// Returns number of bytes written.
#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}
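
// Branchless-compaction sketch: every byte is written into outbuf, but the write cursor
// only advances for bytes that are not in the delete set, so kept bytes end up packed at
// the front. Example bytes are arbitrary; the module name is ours.
#[cfg(test)]
mod delete_chunk_example {
    use super::*;

    #[test]
    fn compacts_kept_bytes() {
        let member = build_member_set(b"-");
        let chunk = b"a-b--c-d";
        let mut outbuf = vec![0u8; chunk.len()];
        let n = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        assert_eq!(&outbuf[..n], b"abcd");
    }
}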
1254
1255fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1256    // Parallel path for large data: each thread deletes from its chunk,
1257    // then use writev to write all results in one syscall batch.
1258    if data.len() >= PARALLEL_THRESHOLD {
1259        let n_threads = rayon::current_num_threads().max(1);
1260        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1261
1262        let results: Vec<Vec<u8>> = data
1263            .par_chunks(chunk_size)
1264            .map(|chunk| {
1265                let mut out = Vec::with_capacity(chunk.len());
1266                let mut last = 0;
1267                for pos in memchr::memchr_iter(ch, chunk) {
1268                    if pos > last {
1269                        out.extend_from_slice(&chunk[last..pos]);
1270                    }
1271                    last = pos + 1;
1272                }
1273                if last < chunk.len() {
1274                    out.extend_from_slice(&chunk[last..]);
1275                }
1276                out
1277            })
1278            .collect();
1279
1280        // Use writev to batch all results into fewer syscalls
1281        let slices: Vec<std::io::IoSlice> = results
1282            .iter()
1283            .filter(|r| !r.is_empty())
1284            .map(|r| std::io::IoSlice::new(r))
1285            .collect();
1286        return write_ioslices(writer, &slices);
1287    }
1288
1289    // Single-write fast path: collect all non-deleted spans into one buffer
1290    if data.len() <= SINGLE_WRITE_LIMIT {
1291        let mut outbuf = Vec::with_capacity(data.len());
1292        let mut last = 0;
1293        for pos in memchr::memchr_iter(ch, data) {
1294            if pos > last {
1295                outbuf.extend_from_slice(&data[last..pos]);
1296            }
1297            last = pos + 1;
1298        }
1299        if last < data.len() {
1300            outbuf.extend_from_slice(&data[last..]);
1301        }
1302        return writer.write_all(&outbuf);
1303    }
1304
1305    // Chunked path for large data
1306    let buf_size = data.len().min(BUF_SIZE);
1307    let mut outbuf = vec![0u8; buf_size];
1308
1309    for chunk in data.chunks(buf_size) {
1310        let mut wp = 0;
1311        let mut last = 0;
1312        for pos in memchr::memchr_iter(ch, chunk) {
1313            if pos > last {
1314                let run = pos - last;
1315                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
1316                wp += run;
1317            }
1318            last = pos + 1;
1319        }
1320        if last < chunk.len() {
1321            let run = chunk.len() - last;
1322            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
1323            wp += run;
1324        }
1325        writer.write_all(&outbuf[..wp])?;
1326    }
1327    Ok(())
1328}
1329
1330fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1331    let c0 = chars[0];
1332    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
1333    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
1334    let is_three = chars.len() >= 3;
1335
1336    // Parallel path for large data
1337    if data.len() >= PARALLEL_THRESHOLD {
1338        let n_threads = rayon::current_num_threads().max(1);
1339        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1340
1341        let results: Vec<Vec<u8>> = data
1342            .par_chunks(chunk_size)
1343            .map(|chunk| {
1344                let mut out = Vec::with_capacity(chunk.len());
1345                let mut last = 0;
1346                if is_three {
1347                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
1348                        if pos > last {
1349                            out.extend_from_slice(&chunk[last..pos]);
1350                        }
1351                        last = pos + 1;
1352                    }
1353                } else {
1354                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
1355                        if pos > last {
1356                            out.extend_from_slice(&chunk[last..pos]);
1357                        }
1358                        last = pos + 1;
1359                    }
1360                }
1361                if last < chunk.len() {
1362                    out.extend_from_slice(&chunk[last..]);
1363                }
1364                out
1365            })
1366            .collect();
1367
1368        // Use writev to batch all results into fewer syscalls
1369        let slices: Vec<std::io::IoSlice> = results
1370            .iter()
1371            .filter(|r| !r.is_empty())
1372            .map(|r| std::io::IoSlice::new(r))
1373            .collect();
1374        return write_ioslices(writer, &slices);
1375    }
1376
1377    // Single-write fast path: collect all non-deleted spans into one buffer
1378    if data.len() <= SINGLE_WRITE_LIMIT {
1379        let mut outbuf = Vec::with_capacity(data.len());
1380        let mut last = 0;
1381        if is_three {
1382            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
1383                if pos > last {
1384                    outbuf.extend_from_slice(&data[last..pos]);
1385                }
1386                last = pos + 1;
1387            }
1388        } else {
1389            for pos in memchr::memchr2_iter(c0, c1, data) {
1390                if pos > last {
1391                    outbuf.extend_from_slice(&data[last..pos]);
1392                }
1393                last = pos + 1;
1394            }
1395        }
1396        if last < data.len() {
1397            outbuf.extend_from_slice(&data[last..]);
1398        }
1399        return writer.write_all(&outbuf);
1400    }
1401
1402    // Chunked path for large data
1403    let buf_size = data.len().min(BUF_SIZE);
1404    let mut outbuf = vec![0u8; buf_size];
1405
1406    for chunk in data.chunks(buf_size) {
1407        let mut wp = 0;
1408        let mut last = 0;
1409
1410        // Iterate directly over memchr iterator without collecting into Vec<usize>.
1411        // Positions are used exactly once in order, so no intermediate allocation needed.
1412        if is_three {
1413            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
1414                if pos > last {
1415                    let run = pos - last;
1416                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
1417                    wp += run;
1418                }
1419                last = pos + 1;
1420            }
1421        } else {
1422            for pos in memchr::memchr2_iter(c0, c1, chunk) {
1423                if pos > last {
1424                    let run = pos - last;
1425                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
1426                    wp += run;
1427                }
1428                last = pos + 1;
1429            }
1430        }
1431
1432        if last < chunk.len() {
1433            let run = chunk.len() - last;
1434            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
1435            wp += run;
1436        }
1437        writer.write_all(&outbuf[..wp])?;
1438    }
1439    Ok(())
1440}
1441
1442/// Delete + squeeze from mmap'd byte slice.
1443///
1444/// For data <= 16MB: delete+squeeze into one buffer, one write syscall.
1445/// For data > 16MB: chunked approach to limit memory.
1446pub fn delete_squeeze_mmap(
1447    delete_chars: &[u8],
1448    squeeze_chars: &[u8],
1449    data: &[u8],
1450    writer: &mut impl Write,
1451) -> io::Result<()> {
1452    let delete_set = build_member_set(delete_chars);
1453    let squeeze_set = build_member_set(squeeze_chars);
1454
1455    // Single-write fast path: delete+squeeze all data in one pass, one write
1456    if data.len() <= SINGLE_WRITE_LIMIT {
1457        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
1458        let mut last_squeezed: u16 = 256;
1459        unsafe {
1460            outbuf.set_len(data.len());
1461            let outp: *mut u8 = outbuf.as_mut_ptr();
1462            let inp = data.as_ptr();
1463            let len = data.len();
1464            let mut out_pos = 0;
1465            let mut i = 0;
1466            while i < len {
1467                let b = *inp.add(i);
1468                if is_member(&delete_set, b) {
1469                    i += 1;
1470                    continue;
1471                }
1472                if is_member(&squeeze_set, b) {
1473                    if last_squeezed == b as u16 {
1474                        i += 1;
1475                        continue;
1476                    }
1477                    last_squeezed = b as u16;
1478                } else {
1479                    last_squeezed = 256;
1480                }
1481                *outp.add(out_pos) = b;
1482                out_pos += 1;
1483                i += 1;
1484            }
1485            outbuf.set_len(out_pos);
1486        }
1487        return writer.write_all(&outbuf);
1488    }
1489
1490    // Chunked path for large data
1491    let buf_size = data.len().min(BUF_SIZE);
1492    let mut outbuf = vec![0u8; buf_size];
1493    let mut last_squeezed: u16 = 256;
1494
1495    for chunk in data.chunks(buf_size) {
1496        let mut out_pos = 0;
1497        for &b in chunk {
1498            if is_member(&delete_set, b) {
1499                continue;
1500            }
1501            if is_member(&squeeze_set, b) {
1502                if last_squeezed == b as u16 {
1503                    continue;
1504                }
1505                last_squeezed = b as u16;
1506            } else {
1507                last_squeezed = 256;
1508            }
1509            unsafe {
1510                *outbuf.get_unchecked_mut(out_pos) = b;
1511            }
1512            out_pos += 1;
1513        }
1514        writer.write_all(&outbuf[..out_pos])?;
1515    }
1516    Ok(())
1517}
1518
1519/// Squeeze from mmap'd byte slice.
1520///
1521/// Squeeze sets of 1-3 bytes dispatch to memchr-based fast paths. Otherwise:
1522/// data >= 2MB is squeezed per-chunk in parallel (rayon) with boundary fixup,
1523/// data <= 16MB gets one buffer and one write syscall, and larger data is chunked to limit memory.
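///
/// Illustrative sketch of the squeeze semantics (hypothetical input, not run as a doctest):
///
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// squeeze_mmap(b" \n", b"a  b\n\n\nc", &mut out).unwrap();
/// assert_eq!(out, b"a b\nc".to_vec());
/// ```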
1524pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1525    if squeeze_chars.len() == 1 {
1526        return squeeze_single_mmap(squeeze_chars[0], data, writer);
1527    }
1528    if squeeze_chars.len() == 2 {
1529        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
1530    }
1531    if squeeze_chars.len() == 3 {
1532        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
1533    }
1534
1535    let member = build_member_set(squeeze_chars);
1536
1537    // Parallel path: squeeze each chunk independently, then fix boundaries
1538    if data.len() >= PARALLEL_THRESHOLD {
1539        let n_threads = rayon::current_num_threads().max(1);
1540        let chunk_size = (data.len() / n_threads).max(32 * 1024);
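        // The 32 KiB floor presumably keeps chunks large enough that rayon task
        // overhead does not dominate the per-chunk work.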
1541
1542        let results: Vec<Vec<u8>> = data
1543            .par_chunks(chunk_size)
1544            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
1545            .collect();
1546
1547        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
1548        // and chunk N+1 starts with same byte B, and B is in squeeze set,
1549        // skip the first byte(s) of chunk N+1 that equal B.
1550        // Collect slices for writev to minimize syscalls.
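        // e.g. with squeeze set {' '}: chunks ["x ", " y"] squeeze to ["x ", " y"], and
        // the fixup drops the leading ' ' of the second slice, so the output is "x y".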
1551        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
1552        for (idx, result) in results.iter().enumerate() {
1553            if result.is_empty() {
1554                continue;
1555            }
1556            if idx > 0 {
1557                // Check boundary: does previous chunk end with same squeezable byte?
1558                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
1559                    if is_member(&member, prev_last) {
1560                        // Skip leading bytes in this chunk that equal prev_last
1561                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
1562                        if skip < result.len() {
1563                            slices.push(std::io::IoSlice::new(&result[skip..]));
1564                        }
1565                        continue;
1566                    }
1567                }
1568            }
1569            slices.push(std::io::IoSlice::new(result));
1570        }
1571        return write_ioslices(writer, &slices);
1572    }
1573
1574    // Single-write fast path: squeeze all data into one buffer, one write
1575    if data.len() <= SINGLE_WRITE_LIMIT {
1576        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
1577        let mut last_squeezed: u16 = 256;
1578        let len = data.len();
1579        let mut wp = 0;
1580        let mut i = 0;
1581
1582        unsafe {
1583            outbuf.set_len(data.len());
1584            let inp = data.as_ptr();
1585            let outp: *mut u8 = outbuf.as_mut_ptr();
1586
1587            while i < len {
1588                let b = *inp.add(i);
1589                if is_member(&member, b) {
1590                    if last_squeezed != b as u16 {
1591                        *outp.add(wp) = b;
1592                        wp += 1;
1593                        last_squeezed = b as u16;
1594                    }
1595                    i += 1;
1596                    while i < len && *inp.add(i) == b {
1597                        i += 1;
1598                    }
1599                } else {
1600                    last_squeezed = 256;
1601                    *outp.add(wp) = b;
1602                    wp += 1;
1603                    i += 1;
1604                }
1605            }
1606            outbuf.set_len(wp);
1607        }
1608        return writer.write_all(&outbuf);
1609    }
1610
1611    // Chunked path for large data
1612    let buf_size = data.len().min(BUF_SIZE);
1613    let mut outbuf = vec![0u8; buf_size];
1614    let mut last_squeezed: u16 = 256;
1615
1616    for chunk in data.chunks(buf_size) {
1617        let len = chunk.len();
1618        let mut wp = 0;
1619        let mut i = 0;
1620
1621        unsafe {
1622            let inp = chunk.as_ptr();
1623            let outp = outbuf.as_mut_ptr();
1624
1625            while i < len {
1626                let b = *inp.add(i);
1627                if is_member(&member, b) {
1628                    if last_squeezed != b as u16 {
1629                        *outp.add(wp) = b;
1630                        wp += 1;
1631                        last_squeezed = b as u16;
1632                    }
1633                    i += 1;
1634                    while i < len && *inp.add(i) == b {
1635                        i += 1;
1636                    }
1637                } else {
1638                    last_squeezed = 256;
1639                    *outp.add(wp) = b;
1640                    wp += 1;
1641                    i += 1;
1642                }
1643            }
1644        }
1645        writer.write_all(&outbuf[..wp])?;
1646    }
1647    Ok(())
1648}
1649
1650/// Squeeze a single chunk using bitset membership. Returns squeezed output.
1651fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
1652    let len = chunk.len();
1653    let mut out = Vec::with_capacity(len);
1654    let mut last_squeezed: u16 = 256;
1655    let mut i = 0;
1656
1657    unsafe {
1658        out.set_len(len);
1659        let inp = chunk.as_ptr();
1660        let outp: *mut u8 = out.as_mut_ptr();
1661        let mut wp = 0;
1662
1663        while i < len {
1664            let b = *inp.add(i);
1665            if is_member(member, b) {
1666                if last_squeezed != b as u16 {
1667                    *outp.add(wp) = b;
1668                    wp += 1;
1669                    last_squeezed = b as u16;
1670                }
1671                i += 1;
1672                while i < len && *inp.add(i) == b {
1673                    i += 1;
1674                }
1675            } else {
1676                last_squeezed = 256;
1677                *outp.add(wp) = b;
1678                wp += 1;
1679                i += 1;
1680            }
1681        }
1682        out.set_len(wp);
1683    }
1684    out
1685}
1686
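/// Squeeze with a compile-time set size N (2 or 3): memchr2/memchr3 jump between
/// occurrences on the serial path, while data >= PARALLEL_THRESHOLD reuses the
/// bitset chunk squeezer with boundary fixup.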
1687fn squeeze_multi_mmap<const N: usize>(
1688    chars: &[u8],
1689    data: &[u8],
1690    writer: &mut impl Write,
1691) -> io::Result<()> {
1692    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
1693    if data.len() >= PARALLEL_THRESHOLD {
1694        let member = build_member_set(chars);
1695        let n_threads = rayon::current_num_threads().max(1);
1696        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1697
1698        let results: Vec<Vec<u8>> = data
1699            .par_chunks(chunk_size)
1700            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
1701            .collect();
1702
1703        // Build IoSlice list, fixing boundaries
1704        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
1705        for (idx, result) in results.iter().enumerate() {
1706            if result.is_empty() {
1707                continue;
1708            }
1709            if idx > 0 {
1710                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
1711                    if is_member(&member, prev_last) {
1712                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
1713                        if skip < result.len() {
1714                            slices.push(std::io::IoSlice::new(&result[skip..]));
1715                        }
1716                        continue;
1717                    }
1718                }
1719            }
1720            slices.push(std::io::IoSlice::new(result));
1721        }
1722        return write_ioslices(writer, &slices);
1723    }
1724
1725    let buf_size = data.len().min(BUF_SIZE);
1726    let mut outbuf = vec![0u8; buf_size];
1727    let mut wp = 0;
1728    let mut last_squeezed: u16 = 256;
1729    let mut cursor = 0;
1730
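    // find_next! selects memchr2 or memchr3 based on the const generic N.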
1731    macro_rules! find_next {
1732        ($data:expr) => {
1733            if N == 2 {
1734                memchr::memchr2(chars[0], chars[1], $data)
1735            } else {
1736                memchr::memchr3(chars[0], chars[1], chars[2], $data)
1737            }
1738        };
1739    }
1740
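    // flush_and_copy! drains outbuf when the pending span would overflow it, and
    // writes spans larger than the buffer straight through to the writer.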
1741    macro_rules! flush_and_copy {
1742        ($src:expr, $len:expr) => {
1743            if wp + $len > buf_size {
1744                writer.write_all(&outbuf[..wp])?;
1745                wp = 0;
1746            }
1747            if $len > buf_size {
1748                writer.write_all($src)?;
1749            } else {
1750                outbuf[wp..wp + $len].copy_from_slice($src);
1751                wp += $len;
1752            }
1753        };
1754    }
1755
1756    while cursor < data.len() {
1757        match find_next!(&data[cursor..]) {
1758            Some(offset) => {
1759                let pos = cursor + offset;
1760                let b = data[pos];
1761                if pos > cursor {
1762                    let span = pos - cursor;
1763                    flush_and_copy!(&data[cursor..pos], span);
1764                    last_squeezed = 256;
1765                }
1766                if last_squeezed != b as u16 {
1767                    if wp >= buf_size {
1768                        writer.write_all(&outbuf[..wp])?;
1769                        wp = 0;
1770                    }
1771                    outbuf[wp] = b;
1772                    wp += 1;
1773                    last_squeezed = b as u16;
1774                }
1775                let mut skip = pos + 1;
1776                while skip < data.len() && data[skip] == b {
1777                    skip += 1;
1778                }
1779                cursor = skip;
1780            }
1781            None => {
1782                let remaining = data.len() - cursor;
1783                flush_and_copy!(&data[cursor..], remaining);
1784                break;
1785            }
1786        }
1787    }
1788    if wp > 0 {
1789        writer.write_all(&outbuf[..wp])?;
1790    }
1791    Ok(())
1792}
1793
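/// Squeeze runs of a single repeated byte, using memchr to jump between occurrences.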
1794fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1795    if data.is_empty() {
1796        return Ok(());
1797    }
1798
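    // Fast bail-out: if no two adjacent occurrences of `ch` exist, there is nothing
    // to squeeze and the data can be written through unchanged.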
1799    if memchr::memmem::find(data, &[ch, ch]).is_none() {
1800        return writer.write_all(data);
1801    }
1802
1803    // Parallel path: squeeze each chunk, fix boundaries
1804    if data.len() >= PARALLEL_THRESHOLD {
1805        let n_threads = rayon::current_num_threads().max(1);
1806        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1807
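        // Each chunk is squeezed independently; runs of `ch` that straddle a chunk
        // boundary are repaired below before the vectored write.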
1808        let results: Vec<Vec<u8>> = data
1809            .par_chunks(chunk_size)
1810            .map(|chunk| {
1811                let mut out = Vec::with_capacity(chunk.len());
1812                let mut cursor = 0;
1813                while cursor < chunk.len() {
1814                    match memchr::memchr(ch, &chunk[cursor..]) {
1815                        Some(offset) => {
1816                            let pos = cursor + offset;
1817                            if pos > cursor {
1818                                out.extend_from_slice(&chunk[cursor..pos]);
1819                            }
1820                            out.push(ch);
1821                            cursor = pos + 1;
1822                            while cursor < chunk.len() && chunk[cursor] == ch {
1823                                cursor += 1;
1824                            }
1825                        }
1826                        None => {
1827                            out.extend_from_slice(&chunk[cursor..]);
1828                            break;
1829                        }
1830                    }
1831                }
1832                out
1833            })
1834            .collect();
1835
1836        // Build IoSlice list, collapsing runs of `ch` that straddle chunk boundaries.
1837        // Use writev to minimize syscalls.
1838        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
1839        for (idx, result) in results.iter().enumerate() {
1840            if result.is_empty() {
1841                continue;
1842            }
1843            if idx > 0 {
1844                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
1845                    if prev_last == ch {
1846                        // Skip leading ch bytes in this chunk result
1847                        let skip = result.iter().take_while(|&&b| b == ch).count();
1848                        if skip < result.len() {
1849                            slices.push(std::io::IoSlice::new(&result[skip..]));
1850                        }
1851                        continue;
1852                    }
1853                }
1854            }
1855            slices.push(std::io::IoSlice::new(result));
1856        }
1857        return write_ioslices(writer, &slices);
1858    }
1859
1860    let buf_size = data.len().min(BUF_SIZE);
1861    let mut outbuf = vec![0u8; buf_size];
1862    let len = data.len();
1863    let mut wp = 0;
1864    let mut cursor = 0;
1865
1866    while cursor < len {
1867        match memchr::memchr(ch, &data[cursor..]) {
1868            Some(offset) => {
1869                let pos = cursor + offset;
1870                let gap = pos - cursor;
1871                if gap > 0 {
1872                    if wp + gap > buf_size {
1873                        writer.write_all(&outbuf[..wp])?;
1874                        wp = 0;
1875                    }
1876                    if gap > buf_size {
1877                        writer.write_all(&data[cursor..pos])?;
1878                    } else {
1879                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
1880                        wp += gap;
1881                    }
1882                }
1883                if wp >= buf_size {
1884                    writer.write_all(&outbuf[..wp])?;
1885                    wp = 0;
1886                }
1887                outbuf[wp] = ch;
1888                wp += 1;
1889                cursor = pos + 1;
1890                while cursor < len && data[cursor] == ch {
1891                    cursor += 1;
1892                }
1893            }
1894            None => {
1895                let remaining = len - cursor;
1896                if remaining > 0 {
1897                    if wp + remaining > buf_size {
1898                        writer.write_all(&outbuf[..wp])?;
1899                        wp = 0;
1900                    }
1901                    if remaining > buf_size {
1902                        writer.write_all(&data[cursor..])?;
1903                    } else {
1904                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
1905                        wp += remaining;
1906                    }
1907                }
1908                break;
1909            }
1910        }
1911    }
1912
1913    if wp > 0 {
1914        writer.write_all(&outbuf[..wp])?;
1915    }
1916    Ok(())
1917}
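
#[cfg(test)]
mod mmap_path_tests {
    // A minimal sketch of unit tests for the mmap entry points above. Inputs and
    // expected outputs are hand-derived from the squeeze/delete semantics these
    // functions implement; they are illustrations, not an exhaustive suite.
    use super::*;

    #[test]
    fn squeeze_mmap_bitset_path_collapses_runs() {
        // Four squeeze chars forces the general bitset path.
        let mut out = Vec::new();
        squeeze_mmap(b" \t\n\r", b"aa  bb\n\nc", &mut out).unwrap();
        assert_eq!(out, b"aa bb\nc".to_vec());
    }

    #[test]
    fn squeeze_mmap_parallel_path_collapses_long_run() {
        // A 3 MiB run crosses PARALLEL_THRESHOLD and exercises the boundary fixup.
        let data = vec![b'a'; 3 * 1024 * 1024];
        let mut out = Vec::new();
        squeeze_mmap(b"a", &data, &mut out).unwrap();
        assert_eq!(out, b"a".to_vec());
    }

    #[test]
    fn delete_squeeze_mmap_deletes_then_squeezes() {
        let mut out = Vec::new();
        delete_squeeze_mmap(b"0123456789", b" ", b"a1  b22  c", &mut out).unwrap();
        assert_eq!(out, b"a b c".to_vec());
    }
}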