
coreutils_rs/tr/core.rs

1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5/// Maximum IoSlice entries per write_vectored batch.
6/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
7const MAX_IOV: usize = 1024;
8
9/// Main processing buffer: 4MB (fits in L3 cache, avoids cache thrashing).
10const BUF_SIZE: usize = 4 * 1024 * 1024;
11
12/// Stream buffer: 4MB — all tr streaming operations (translate, delete, squeeze)
13/// are compute-light (single table lookup or bitset check per byte), so the
14/// bottleneck is I/O syscalls, not cache pressure. 4MB buffer means only
15/// 3 read()/write() syscall pairs for a 10MB input instead of 10+.
16/// This applies to ALL streaming modes (delete, squeeze, translate).
17const STREAM_BUF: usize = 4 * 1024 * 1024;
18
19/// Minimum data size to engage rayon parallel processing for mmap paths.
20/// Below this, single-threaded is faster due to thread pool overhead.
21const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
22
23/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
24/// Falls back to write_all per slice for partial writes.
25#[inline]
26fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
27    if slices.is_empty() {
28        return Ok(());
29    }
30    for batch in slices.chunks(MAX_IOV) {
31        let total: usize = batch.iter().map(|s| s.len()).sum();
32        match writer.write_vectored(batch) {
33            Ok(n) if n >= total => continue,
34            Ok(mut written) => {
35                // Partial write: fall back to write_all per remaining slice
36                for slice in batch {
37                    let slen = slice.len();
38                    if written >= slen {
39                        written -= slen;
40                        continue;
41                    }
42                    if written > 0 {
43                        writer.write_all(&slice[written..])?;
44                        written = 0;
45                    } else {
46                        writer.write_all(slice)?;
47                    }
48                }
49            }
50            Err(e) => return Err(e),
51        }
52    }
53    Ok(())
54}
55
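// Illustrative usage, added as an annotation (not part of the original source):
// writing three slices through write_ioslices should produce their exact
// concatenation, whether the writer takes the vectored or the fallback path.
#[cfg(test)]
mod write_ioslices_example {
    use super::*;
    use std::io::IoSlice;

    #[test]
    fn concatenates_slices_in_order() {
        let parts: [&[u8]; 3] = [b"foo", b"bar", b"baz"];
        let slices: Vec<IoSlice> = parts.iter().map(|p| IoSlice::new(p)).collect();
        let mut out: Vec<u8> = Vec::new();
        write_ioslices(&mut out, &slices).unwrap();
        assert_eq!(out, b"foobarbaz".to_vec());
    }
}
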
56/// Allocate a Vec<u8> of given length without zero-initialization.
57/// SAFETY: Caller must write all bytes before reading them.
58#[inline]
59#[allow(clippy::uninit_vec)]
60fn alloc_uninit_vec(len: usize) -> Vec<u8> {
61    let mut v = Vec::with_capacity(len);
62    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
63    unsafe {
64        v.set_len(len);
65    }
66    v
67}
68
69/// Build a 256-byte lookup table mapping set1[i] -> set2[i]; a shorter set2 is padded with its last byte.
70#[inline]
71fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
72    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
73    let last = set2.last().copied();
74    for (i, &from) in set1.iter().enumerate() {
75        table[from as usize] = if i < set2.len() {
76            set2[i]
77        } else {
78            last.unwrap_or(from)
79        };
80    }
81    table
82}
83
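// Illustrative check, added as an annotation (not original code): when set2 is
// shorter than set1, the table pads with set2's last byte, matching the
// padding behavior implemented above.
#[cfg(test)]
mod translate_table_example {
    use super::*;

    #[test]
    fn pads_short_set2_with_its_last_byte() {
        let table = build_translate_table(b"abc", b"xy");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'y');
        assert_eq!(table[b'c' as usize], b'y'); // padded with the last byte of set2
        assert_eq!(table[b'z' as usize], b'z'); // bytes outside set1 stay identity
    }
}
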
84/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
85#[inline]
86fn build_member_set(chars: &[u8]) -> [u8; 32] {
87    let mut set = [0u8; 32];
88    for &ch in chars {
89        set[ch as usize >> 3] |= 1 << (ch & 7);
90    }
91    set
92}
93
94#[inline(always)]
95fn is_member(set: &[u8; 32], ch: u8) -> bool {
96    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
97}
98
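// Illustrative check, added as an annotation (not original code): one bit per
// byte value, so membership is a single load, shift, and mask.
#[cfg(test)]
mod member_set_example {
    use super::*;

    #[test]
    fn bitset_reports_exactly_the_given_bytes() {
        let set = build_member_set(b"aeiou");
        assert!(is_member(&set, b'a'));
        assert!(is_member(&set, b'u'));
        assert!(!is_member(&set, b'b'));
        assert!(!is_member(&set, 0)); // NUL was never inserted
    }
}
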
99/// Translate bytes in-place using a 256-byte lookup table.
100/// On x86_64, uses SIMD pshufb-based nibble decomposition (32 bytes per iteration
101/// with AVX2, 16 with SSSE3). Falls back to 8x-unrolled scalar on other platforms.
102#[inline(always)]
103fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
104    #[cfg(target_arch = "x86_64")]
105    {
106        if is_x86_feature_detected!("avx2") {
107            unsafe { translate_inplace_avx2_table(data, table) };
108            return;
109        }
110        if is_x86_feature_detected!("ssse3") {
111            unsafe { translate_inplace_ssse3_table(data, table) };
112            return;
113        }
114    }
115    translate_inplace_scalar(data, table);
116}
117
118/// Scalar fallback: 8x-unrolled table lookup.
119#[inline(always)]
120fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
121    let len = data.len();
122    let ptr = data.as_mut_ptr();
123    let mut i = 0;
124    unsafe {
125        while i + 8 <= len {
126            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
127            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
128            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
129            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
130            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
131            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
132            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
133            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
134            i += 8;
135        }
136        while i < len {
137            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
138            i += 1;
139        }
140    }
141}
142
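// Illustrative check, added as an annotation (not original code): whichever
// path the dispatcher picks (AVX2, SSSE3, or scalar), the result must equal a
// naive per-byte table lookup.
#[cfg(test)]
mod translate_inplace_example {
    use super::*;

    #[test]
    fn matches_a_naive_per_byte_lookup() {
        let table = build_translate_table(b"abc", b"xyz");
        let mut data = b"aabbccdd-0123".to_vec();
        let expected: Vec<u8> = data.iter().map(|&b| table[b as usize]).collect();
        translate_inplace(&mut data, &table);
        assert_eq!(data, expected);
    }
}
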
143// ============================================================================
144// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
145// ============================================================================
146//
147// For an arbitrary 256-byte lookup table, we decompose each byte into
148// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
149// SIMD vectors, one for each high nibble value h (0..15), containing
150// the 16 table entries table[h*16+0..h*16+15]. Then for each input
151// vector we:
152//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
153//   2. Extract high nibble (shift right 4) -> used to select which table
154//   3. For each of the 16 high nibble values, create a mask where
155//      the high nibble equals that value, pshufb the corresponding
156//      table, and accumulate results
157//
158// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
159// With instruction-level parallelism, this achieves much higher throughput
160// than scalar table lookups which have serial data dependencies.
161
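// Scalar model of the decomposition described above, added as an annotation
// (not original code): for every byte b, table[b] == row(b >> 4)[b & 0x0F],
// where row(h) is the 16-entry slice selected by the high nibble. The SIMD
// code below evaluates the same identity with pshufb, 16 or 32 lanes at a time.
#[cfg(test)]
mod nibble_decomposition_example {
    use super::*;

    #[test]
    fn high_low_nibble_lookup_reproduces_the_table() {
        let table = build_translate_table(b"abcXYZ", b"ABCxyz");
        for b in 0..=255u8 {
            let hi = (b >> 4) as usize;
            let lo = (b & 0x0F) as usize;
            let row = &table[hi * 16..hi * 16 + 16];
            assert_eq!(row[lo], table[b as usize]);
        }
    }
}
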
162#[cfg(target_arch = "x86_64")]
163#[target_feature(enable = "avx2")]
164unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
165    use std::arch::x86_64::*;
166
167    unsafe {
168        let len = data.len();
169        let ptr = data.as_mut_ptr();
170
171        // Pre-build 16 lookup vectors, one per high nibble value.
172        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
173        // 16 table entries for pshufb indexing by low nibble.
174        let mut lut = [_mm256_setzero_si256(); 16];
175        for h in 0u8..16 {
176            let base = (h as usize) * 16;
177            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
178            // Broadcast the 128-bit row to both lanes of the 256-bit vector
179            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
180            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
181        }
182
183        let lo_mask = _mm256_set1_epi8(0x0F);
184
185        let mut i = 0;
186        while i + 32 <= len {
187            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
188            let lo_nibble = _mm256_and_si256(input, lo_mask);
189            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
190
191            // Accumulate result: for each high nibble value h, select bytes where
192            // hi_nibble == h, look up in lut[h] using lo_nibble, blend into result.
193            let mut result = _mm256_setzero_si256();
194
195            // Unroll the 16 high-nibble iterations for best ILP
196            macro_rules! do_nibble {
197                ($h:expr) => {
198                    let h_val = _mm256_set1_epi8($h as i8);
199                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
200                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
201                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
202                };
203            }
204            do_nibble!(0);
205            do_nibble!(1);
206            do_nibble!(2);
207            do_nibble!(3);
208            do_nibble!(4);
209            do_nibble!(5);
210            do_nibble!(6);
211            do_nibble!(7);
212            do_nibble!(8);
213            do_nibble!(9);
214            do_nibble!(10);
215            do_nibble!(11);
216            do_nibble!(12);
217            do_nibble!(13);
218            do_nibble!(14);
219            do_nibble!(15);
220
221            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
222            i += 32;
223        }
224
225        // SSE/SSSE3 tail for remaining 16-byte chunk
226        if i + 16 <= len {
227            let lo_mask128 = _mm_set1_epi8(0x0F);
228
229            // Build 128-bit LUTs (just the lower lane of each 256-bit LUT)
230            let mut lut128 = [_mm_setzero_si128(); 16];
231            for h in 0u8..16 {
232                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
233            }
234
235            let input = _mm_loadu_si128(ptr.add(i) as *const _);
236            let lo_nib = _mm_and_si128(input, lo_mask128);
237            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
238
239            let mut res = _mm_setzero_si128();
240            macro_rules! do_nibble128 {
241                ($h:expr) => {
242                    let h_val = _mm_set1_epi8($h as i8);
243                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
244                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
245                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
246                };
247            }
248            do_nibble128!(0);
249            do_nibble128!(1);
250            do_nibble128!(2);
251            do_nibble128!(3);
252            do_nibble128!(4);
253            do_nibble128!(5);
254            do_nibble128!(6);
255            do_nibble128!(7);
256            do_nibble128!(8);
257            do_nibble128!(9);
258            do_nibble128!(10);
259            do_nibble128!(11);
260            do_nibble128!(12);
261            do_nibble128!(13);
262            do_nibble128!(14);
263            do_nibble128!(15);
264
265            _mm_storeu_si128(ptr.add(i) as *mut _, res);
266            i += 16;
267        }
268
269        // Scalar tail
270        while i < len {
271            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
272            i += 1;
273        }
274    }
275}
276
277#[cfg(target_arch = "x86_64")]
278#[target_feature(enable = "ssse3")]
279unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
280    use std::arch::x86_64::*;
281
282    unsafe {
283        let len = data.len();
284        let ptr = data.as_mut_ptr();
285
286        // Pre-build 16 lookup vectors for pshufb
287        let mut lut = [_mm_setzero_si128(); 16];
288        for h in 0u8..16 {
289            let base = (h as usize) * 16;
290            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
291            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
292        }
293
294        let lo_mask = _mm_set1_epi8(0x0F);
295
296        let mut i = 0;
297        while i + 16 <= len {
298            let input = _mm_loadu_si128(ptr.add(i) as *const _);
299            let lo_nibble = _mm_and_si128(input, lo_mask);
300            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
301
302            let mut result = _mm_setzero_si128();
303
304            macro_rules! do_nibble {
305                ($h:expr) => {
306                    let h_val = _mm_set1_epi8($h as i8);
307                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
308                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
309                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
310                };
311            }
312            do_nibble!(0);
313            do_nibble!(1);
314            do_nibble!(2);
315            do_nibble!(3);
316            do_nibble!(4);
317            do_nibble!(5);
318            do_nibble!(6);
319            do_nibble!(7);
320            do_nibble!(8);
321            do_nibble!(9);
322            do_nibble!(10);
323            do_nibble!(11);
324            do_nibble!(12);
325            do_nibble!(13);
326            do_nibble!(14);
327            do_nibble!(15);
328
329            _mm_storeu_si128(ptr.add(i) as *mut _, result);
330            i += 16;
331        }
332
333        // Scalar tail
334        while i < len {
335            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
336            i += 1;
337        }
338    }
339}
340
341/// Translate bytes from source to destination using a 256-byte lookup table.
342/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
343#[inline(always)]
344fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
345    debug_assert!(dst.len() >= src.len());
346    #[cfg(target_arch = "x86_64")]
347    {
348        if is_x86_feature_detected!("avx2") {
349            unsafe { translate_to_avx2_table(src, dst, table) };
350            return;
351        }
352        if is_x86_feature_detected!("ssse3") {
353            unsafe { translate_to_ssse3_table(src, dst, table) };
354            return;
355        }
356    }
357    translate_to_scalar(src, dst, table);
358}
359
360/// Scalar fallback for translate_to.
361#[inline(always)]
362fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
363    unsafe {
364        let sp = src.as_ptr();
365        let dp = dst.as_mut_ptr();
366        let len = src.len();
367        let mut i = 0;
368        while i + 8 <= len {
369            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
370            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
371            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
372            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
373            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
374            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
375            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
376            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
377            i += 8;
378        }
379        while i < len {
380            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
381            i += 1;
382        }
383    }
384}
385
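// Illustrative check, added as an annotation (not original code): the
// out-of-place variant must agree with the in-place variant on the same table.
#[cfg(test)]
mod translate_to_example {
    use super::*;

    #[test]
    fn out_of_place_matches_in_place() {
        let table = build_translate_table(b"0123456789", b"##########");
        let src = b"call 555-0100 now".to_vec();
        let mut dst = vec![0u8; src.len()];
        translate_to(&src, &mut dst, &table);

        let mut inplace = src.clone();
        translate_inplace(&mut inplace, &table);
        assert_eq!(dst, inplace);
    }
}
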
386#[cfg(target_arch = "x86_64")]
387#[target_feature(enable = "avx2")]
388unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
389    use std::arch::x86_64::*;
390
391    unsafe {
392        let len = src.len();
393        let sp = src.as_ptr();
394        let dp = dst.as_mut_ptr();
395
396        // Pre-build 16 lookup vectors
397        let mut lut = [_mm256_setzero_si256(); 16];
398        for h in 0u8..16 {
399            let base = (h as usize) * 16;
400            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
401            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
402            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
403        }
404
405        let lo_mask = _mm256_set1_epi8(0x0F);
406
407        let mut i = 0;
408        while i + 32 <= len {
409            let input = _mm256_loadu_si256(sp.add(i) as *const _);
410            let lo_nibble = _mm256_and_si256(input, lo_mask);
411            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
412
413            let mut result = _mm256_setzero_si256();
414
415            macro_rules! do_nibble {
416                ($h:expr) => {
417                    let h_val = _mm256_set1_epi8($h as i8);
418                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
419                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
420                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
421                };
422            }
423            do_nibble!(0);
424            do_nibble!(1);
425            do_nibble!(2);
426            do_nibble!(3);
427            do_nibble!(4);
428            do_nibble!(5);
429            do_nibble!(6);
430            do_nibble!(7);
431            do_nibble!(8);
432            do_nibble!(9);
433            do_nibble!(10);
434            do_nibble!(11);
435            do_nibble!(12);
436            do_nibble!(13);
437            do_nibble!(14);
438            do_nibble!(15);
439
440            _mm256_storeu_si256(dp.add(i) as *mut _, result);
441            i += 32;
442        }
443
444        // SSSE3 tail for remaining 16-byte chunk
445        if i + 16 <= len {
446            let lo_mask128 = _mm_set1_epi8(0x0F);
447            let mut lut128 = [_mm_setzero_si128(); 16];
448            for h in 0u8..16 {
449                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
450            }
451
452            let input = _mm_loadu_si128(sp.add(i) as *const _);
453            let lo_nib = _mm_and_si128(input, lo_mask128);
454            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
455
456            let mut res = _mm_setzero_si128();
457            macro_rules! do_nibble128 {
458                ($h:expr) => {
459                    let h_val = _mm_set1_epi8($h as i8);
460                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
461                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
462                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
463                };
464            }
465            do_nibble128!(0);
466            do_nibble128!(1);
467            do_nibble128!(2);
468            do_nibble128!(3);
469            do_nibble128!(4);
470            do_nibble128!(5);
471            do_nibble128!(6);
472            do_nibble128!(7);
473            do_nibble128!(8);
474            do_nibble128!(9);
475            do_nibble128!(10);
476            do_nibble128!(11);
477            do_nibble128!(12);
478            do_nibble128!(13);
479            do_nibble128!(14);
480            do_nibble128!(15);
481
482            _mm_storeu_si128(dp.add(i) as *mut _, res);
483            i += 16;
484        }
485
486        // Scalar tail
487        while i < len {
488            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
489            i += 1;
490        }
491    }
492}
493
494#[cfg(target_arch = "x86_64")]
495#[target_feature(enable = "ssse3")]
496unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
497    use std::arch::x86_64::*;
498
499    unsafe {
500        let len = src.len();
501        let sp = src.as_ptr();
502        let dp = dst.as_mut_ptr();
503
504        let mut lut = [_mm_setzero_si128(); 16];
505        for h in 0u8..16 {
506            let base = (h as usize) * 16;
507            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
508            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
509        }
510
511        let lo_mask = _mm_set1_epi8(0x0F);
512
513        let mut i = 0;
514        while i + 16 <= len {
515            let input = _mm_loadu_si128(sp.add(i) as *const _);
516            let lo_nibble = _mm_and_si128(input, lo_mask);
517            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
518
519            let mut result = _mm_setzero_si128();
520
521            macro_rules! do_nibble {
522                ($h:expr) => {
523                    let h_val = _mm_set1_epi8($h as i8);
524                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
525                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
526                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
527                };
528            }
529            do_nibble!(0);
530            do_nibble!(1);
531            do_nibble!(2);
532            do_nibble!(3);
533            do_nibble!(4);
534            do_nibble!(5);
535            do_nibble!(6);
536            do_nibble!(7);
537            do_nibble!(8);
538            do_nibble!(9);
539            do_nibble!(10);
540            do_nibble!(11);
541            do_nibble!(12);
542            do_nibble!(13);
543            do_nibble!(14);
544            do_nibble!(15);
545
546            _mm_storeu_si128(dp.add(i) as *mut _, result);
547            i += 16;
548        }
549
550        // Scalar tail
551        while i < len {
552            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
553            i += 1;
554        }
555    }
556}
557
558// ============================================================================
559// SIMD range translation (x86_64)
560// ============================================================================
561
562/// Detect if the translate table is a single contiguous range with constant offset.
563/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
564/// table[i] = i + offset for all i in [lo, hi].
565#[inline]
566fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
567    let mut lo: Option<u8> = None;
568    let mut hi = 0u8;
569    let mut offset = 0i16;
570
571    for i in 0..256 {
572        if table[i] != i as u8 {
573            let diff = table[i] as i16 - i as i16;
574            match lo {
575                None => {
576                    lo = Some(i as u8);
577                    hi = i as u8;
578                    offset = diff;
579                }
580                Some(_) => {
581                    if diff != offset || i as u8 != hi.wrapping_add(1) {
582                        return None;
583                    }
584                    hi = i as u8;
585                }
586            }
587        }
588    }
589
590    lo.map(|l| (l, hi, offset as i8))
591}
592
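// Illustrative check, added as an annotation (not original code): a-z -> A-Z is
// the canonical single-range case (constant offset of -32), while two isolated
// mappings with the same offset are rejected because they are not contiguous.
#[cfg(test)]
mod range_offset_example {
    use super::*;

    #[test]
    fn lowercase_to_uppercase_is_one_range_with_constant_offset() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&set1, &set2);
        assert_eq!(detect_range_offset(&table), Some((b'a', b'z', -32)));
    }

    #[test]
    fn non_contiguous_tables_are_rejected() {
        let table = build_translate_table(b"ax", b"by");
        assert_eq!(detect_range_offset(&table), None);
    }
}
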
593/// SIMD-accelerated range translation for mmap'd data.
594/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
595/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
596#[cfg(target_arch = "x86_64")]
597fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
598    if is_x86_feature_detected!("avx2") {
599        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
600    } else {
601        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
602    }
603}
604
605#[cfg(target_arch = "x86_64")]
606#[target_feature(enable = "avx2")]
607unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
608    use std::arch::x86_64::*;
609
610    unsafe {
611        let range = hi - lo;
612        // Bias: shift range so lo maps to -128 (signed min).
613        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
614        // For input < lo: biased wraps to large positive (signed), > threshold.
615        // For input > hi: biased > -128+range, > threshold.
616        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
617        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
618        let offset_v = _mm256_set1_epi8(offset);
619        let zero = _mm256_setzero_si256();
620
621        let len = src.len();
622        let mut i = 0;
623
624        while i + 32 <= len {
625            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
626            let biased = _mm256_add_epi8(input, bias_v);
627            // gt = 0xFF where biased > threshold (OUT of range)
628            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
629            // mask = 0xFF where IN range (NOT gt)
630            let mask = _mm256_cmpeq_epi8(gt, zero);
631            let offset_masked = _mm256_and_si256(mask, offset_v);
632            let result = _mm256_add_epi8(input, offset_masked);
633            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
634            i += 32;
635        }
636
637        // SSE2 tail for 16-byte remainder
638        if i + 16 <= len {
639            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
640            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
641            let offset_v128 = _mm_set1_epi8(offset);
642            let zero128 = _mm_setzero_si128();
643
644            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
645            let biased = _mm_add_epi8(input, bias_v128);
646            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
647            let mask = _mm_cmpeq_epi8(gt, zero128);
648            let offset_masked = _mm_and_si128(mask, offset_v128);
649            let result = _mm_add_epi8(input, offset_masked);
650            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
651            i += 16;
652        }
653
654        // Scalar tail
655        while i < len {
656            let b = *src.get_unchecked(i);
657            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
658                b.wrapping_add(offset as u8)
659            } else {
660                b
661            };
662            i += 1;
663        }
664    }
665}
666
667#[cfg(target_arch = "x86_64")]
668#[target_feature(enable = "sse2")]
669unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
670    use std::arch::x86_64::*;
671
672    unsafe {
673        let range = hi - lo;
674        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
675        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
676        let offset_v = _mm_set1_epi8(offset);
677        let zero = _mm_setzero_si128();
678
679        let len = src.len();
680        let mut i = 0;
681
682        while i + 16 <= len {
683            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
684            let biased = _mm_add_epi8(input, bias_v);
685            let gt = _mm_cmpgt_epi8(biased, threshold_v);
686            let mask = _mm_cmpeq_epi8(gt, zero);
687            let offset_masked = _mm_and_si128(mask, offset_v);
688            let result = _mm_add_epi8(input, offset_masked);
689            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
690            i += 16;
691        }
692
693        while i < len {
694            let b = *src.get_unchecked(i);
695            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
696                b.wrapping_add(offset as u8)
697            } else {
698                b
699            };
700            i += 1;
701        }
702    }
703}
704
705/// Scalar range translation fallback for non-x86_64.
706#[cfg(not(target_arch = "x86_64"))]
707fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
708    for (i, &b) in src.iter().enumerate() {
709        dst[i] = if b >= lo && b <= hi {
710            b.wrapping_add(offset as u8)
711        } else {
712            b
713        };
714    }
715}
716
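// Illustrative cross-check, added as an annotation (not original code): the
// range path and the generic table path must agree on a-z -> A-Z input.
#[cfg(test)]
mod range_translate_example {
    use super::*;

    #[test]
    fn range_path_matches_table_path() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&set1, &set2);
        let (lo, hi, offset) = detect_range_offset(&table).expect("contiguous range");

        let src = b"Hello, World! 123".to_vec();
        let mut via_range = vec![0u8; src.len()];
        translate_range_simd(&src, &mut via_range, lo, hi, offset);

        let mut via_table = src.clone();
        translate_inplace(&mut via_table, &table);
        assert_eq!(via_range, via_table);
    }
}
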
717// ============================================================================
718// In-place SIMD range translation (saves one buffer allocation in streaming)
719// ============================================================================
720
721/// In-place SIMD-accelerated range translation.
722/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
723/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
724#[cfg(target_arch = "x86_64")]
725fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
726    if is_x86_feature_detected!("avx2") {
727        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
728    } else {
729        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
730    }
731}
732
733#[cfg(target_arch = "x86_64")]
734#[target_feature(enable = "avx2")]
735unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
736    use std::arch::x86_64::*;
737
738    unsafe {
739        let range = hi - lo;
740        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
741        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
742        let offset_v = _mm256_set1_epi8(offset);
743        let zero = _mm256_setzero_si256();
744
745        let len = data.len();
746        let ptr = data.as_mut_ptr();
747        let mut i = 0;
748
749        while i + 32 <= len {
750            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
751            let biased = _mm256_add_epi8(input, bias_v);
752            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
753            let mask = _mm256_cmpeq_epi8(gt, zero);
754            let offset_masked = _mm256_and_si256(mask, offset_v);
755            let result = _mm256_add_epi8(input, offset_masked);
756            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
757            i += 32;
758        }
759
760        if i + 16 <= len {
761            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
762            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
763            let offset_v128 = _mm_set1_epi8(offset);
764            let zero128 = _mm_setzero_si128();
765
766            let input = _mm_loadu_si128(ptr.add(i) as *const _);
767            let biased = _mm_add_epi8(input, bias_v128);
768            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
769            let mask = _mm_cmpeq_epi8(gt, zero128);
770            let offset_masked = _mm_and_si128(mask, offset_v128);
771            let result = _mm_add_epi8(input, offset_masked);
772            _mm_storeu_si128(ptr.add(i) as *mut _, result);
773            i += 16;
774        }
775
776        while i < len {
777            let b = *ptr.add(i);
778            *ptr.add(i) = if b >= lo && b <= hi {
779                b.wrapping_add(offset as u8)
780            } else {
781                b
782            };
783            i += 1;
784        }
785    }
786}
787
788#[cfg(target_arch = "x86_64")]
789#[target_feature(enable = "sse2")]
790unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
791    use std::arch::x86_64::*;
792
793    unsafe {
794        let range = hi - lo;
795        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
796        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
797        let offset_v = _mm_set1_epi8(offset);
798        let zero = _mm_setzero_si128();
799
800        let len = data.len();
801        let ptr = data.as_mut_ptr();
802        let mut i = 0;
803
804        while i + 16 <= len {
805            let input = _mm_loadu_si128(ptr.add(i) as *const _);
806            let biased = _mm_add_epi8(input, bias_v);
807            let gt = _mm_cmpgt_epi8(biased, threshold_v);
808            let mask = _mm_cmpeq_epi8(gt, zero);
809            let offset_masked = _mm_and_si128(mask, offset_v);
810            let result = _mm_add_epi8(input, offset_masked);
811            _mm_storeu_si128(ptr.add(i) as *mut _, result);
812            i += 16;
813        }
814
815        while i < len {
816            let b = *ptr.add(i);
817            *ptr.add(i) = if b >= lo && b <= hi {
818                b.wrapping_add(offset as u8)
819            } else {
820                b
821            };
822            i += 1;
823        }
824    }
825}
826
827#[cfg(not(target_arch = "x86_64"))]
828fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
829    for b in data.iter_mut() {
830        if *b >= lo && *b <= hi {
831            *b = b.wrapping_add(offset as u8);
832        }
833    }
834}
835
836// ============================================================================
837// SIMD range deletion (x86_64)
838// ============================================================================
839
840/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
841/// Returns Some((lo, hi)) if so. This is true for common classes:
842/// - `[:digit:]` = 0x30..=0x39
843/// - `a-z` = 0x61..=0x7A
844/// - `A-Z` = 0x41..=0x5A
845#[inline]
846fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
847    if chars.is_empty() {
848        return None;
849    }
850    let mut lo = chars[0];
851    let mut hi = chars[0];
852    for &c in &chars[1..] {
853        if c < lo {
854            lo = c;
855        }
856        if c > hi {
857            hi = c;
858        }
859    }
860    // Check that the range size matches the number of chars (no gaps)
861    if (hi - lo + 1) as usize == chars.len() {
862        Some((lo, hi))
863    } else {
864        None
865    }
866}
867
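// Illustrative check, added as an annotation (not original code): only the
// span matters, not the order; gaps in the span disqualify the range path.
#[cfg(test)]
mod delete_range_detection_example {
    use super::*;

    #[test]
    fn contiguous_sets_are_detected_and_gappy_sets_are_not() {
        assert_eq!(detect_delete_range(b"0123456789"), Some((0x30, 0x39)));
        assert_eq!(detect_delete_range(b"3120"), Some((b'0', b'3')));
        // 'a' and 'z' span 26 values but only 2 are present: not a range.
        assert_eq!(detect_delete_range(b"az"), None);
    }
}
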
868/// SIMD-accelerated delete for contiguous byte ranges.
869/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
870/// then compacts output by skipping matched bytes.
871#[cfg(target_arch = "x86_64")]
872fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
873    if is_x86_feature_detected!("avx2") {
874        unsafe { delete_range_avx2(src, dst, lo, hi) }
875    } else {
876        unsafe { delete_range_sse2(src, dst, lo, hi) }
877    }
878}
879
880#[cfg(target_arch = "x86_64")]
881#[target_feature(enable = "avx2")]
882unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
883    use std::arch::x86_64::*;
884
885    unsafe {
886        let range = hi - lo;
887        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
888        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
889        let zero = _mm256_setzero_si256();
890
891        let len = src.len();
892        let sp = src.as_ptr();
893        let dp = dst.as_mut_ptr();
894        let mut ri = 0;
895        let mut wp = 0;
896
897        while ri + 32 <= len {
898            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
899            let biased = _mm256_add_epi8(input, bias_v);
900            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
901            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
902            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
903            let in_range = _mm256_cmpeq_epi8(gt, zero);
904            // keep_mask bits: 1 = keep (NOT in range)
905            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
906
907            // Compact: copy kept bytes using the mask
908            let mut mask = keep_mask;
909            while mask != 0 {
910                let bit = mask.trailing_zeros() as usize;
911                *dp.add(wp) = *sp.add(ri + bit);
912                wp += 1;
913                mask &= mask - 1; // clear lowest set bit
914            }
915            ri += 32;
916        }
917
918        // SSE2 tail for 16-byte remainder
919        if ri + 16 <= len {
920            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
921            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
922            let zero128 = _mm_setzero_si128();
923
924            let input = _mm_loadu_si128(sp.add(ri) as *const _);
925            let biased = _mm_add_epi8(input, bias_v128);
926            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
927            let in_range = _mm_cmpeq_epi8(gt, zero128);
928            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
929
930            let mut mask = keep_mask;
931            while mask != 0 {
932                let bit = mask.trailing_zeros() as usize;
933                *dp.add(wp) = *sp.add(ri + bit);
934                wp += 1;
935                mask &= mask - 1;
936            }
937            ri += 16;
938        }
939
940        // Scalar tail
941        while ri < len {
942            let b = *sp.add(ri);
943            if b < lo || b > hi {
944                *dp.add(wp) = b;
945                wp += 1;
946            }
947            ri += 1;
948        }
949
950        wp
951    }
952}
953
954#[cfg(target_arch = "x86_64")]
955#[target_feature(enable = "sse2")]
956unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
957    use std::arch::x86_64::*;
958
959    unsafe {
960        let range = hi - lo;
961        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
962        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
963        let zero = _mm_setzero_si128();
964
965        let len = src.len();
966        let sp = src.as_ptr();
967        let dp = dst.as_mut_ptr();
968        let mut ri = 0;
969        let mut wp = 0;
970
971        while ri + 16 <= len {
972            let input = _mm_loadu_si128(sp.add(ri) as *const _);
973            let biased = _mm_add_epi8(input, bias_v);
974            let gt = _mm_cmpgt_epi8(biased, threshold_v);
975            let in_range = _mm_cmpeq_epi8(gt, zero);
976            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
977
978            let mut mask = keep_mask;
979            while mask != 0 {
980                let bit = mask.trailing_zeros() as usize;
981                *dp.add(wp) = *sp.add(ri + bit);
982                wp += 1;
983                mask &= mask - 1;
984            }
985            ri += 16;
986        }
987
988        while ri < len {
989            let b = *sp.add(ri);
990            if b < lo || b > hi {
991                *dp.add(wp) = b;
992                wp += 1;
993            }
994            ri += 1;
995        }
996
997        wp
998    }
999}
1000
1001/// Scalar range delete fallback for non-x86_64.
1002#[cfg(not(target_arch = "x86_64"))]
1003fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
1004    let mut wp = 0;
1005    for &b in src {
1006        if b < lo || b > hi {
1007            dst[wp] = b;
1008            wp += 1;
1009        }
1010    }
1011    wp
1012}
1013
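// Illustrative check, added as an annotation (not original code): deleting the
// digit range keeps every other byte and reports the compacted length.
#[cfg(test)]
mod delete_range_chunk_example {
    use super::*;

    #[test]
    fn strips_digits_and_keeps_everything_else() {
        let src = b"a1b22c333d".to_vec();
        let mut dst = vec![0u8; src.len()];
        let n = delete_range_chunk(&src, &mut dst, b'0', b'9');
        assert_eq!(dst[..n].to_vec(), b"abcd".to_vec());
    }
}
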
1014/// Streaming delete for contiguous byte ranges using SIMD range detection.
1015/// Uses 4MB buffer to reduce syscalls (delete is compute-light, I/O bound).
1016fn delete_range_streaming(
1017    lo: u8,
1018    hi: u8,
1019    reader: &mut impl Read,
1020    writer: &mut impl Write,
1021) -> io::Result<()> {
1022    let mut src = vec![0u8; STREAM_BUF];
1023    let mut dst = alloc_uninit_vec(STREAM_BUF);
1024    loop {
1025        let n = read_full(reader, &mut src)?;
1026        if n == 0 {
1027            break;
1028        }
1029        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
1030        if wp > 0 {
1031            writer.write_all(&dst[..wp])?;
1032        }
1033    }
1034    Ok(())
1035}
1036
1037// ============================================================================
1038// Streaming functions (Read + Write)
1039// ============================================================================
1040
1041pub fn translate(
1042    set1: &[u8],
1043    set2: &[u8],
1044    reader: &mut impl Read,
1045    writer: &mut impl Write,
1046) -> io::Result<()> {
1047    let table = build_translate_table(set1, set2);
1048
1049    // Check for identity table — pure passthrough (no transformation needed)
1050    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1051    if is_identity {
1052        return passthrough_stream(reader, writer);
1053    }
1054
1055    // Try SIMD fast path for range translations (in-place, single buffer)
1056    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1057        return translate_range_stream(lo, hi, offset, reader, writer);
1058    }
1059
1060    // General case: IN-PLACE translation on a SINGLE 4MB buffer.
1061    // This shrinks the working set vs the old separate src/dst approach:
1062    // - Old: two 4MB buffers live at once (read into src, translate src→dst, write dst).
1063    // - New: one 4MB buffer (read, translate in place, write back), so better cache locality.
1064    // The 8x-unrolled in-place translate avoids store-to-load forwarding stalls
1065    // because consecutive reads are 8 bytes apart (sequential), not aliased.
1066    // Using 4MB buffer (vs 1MB) reduces syscall count from 10 to 3 for 10MB.
1067    let mut buf = vec![0u8; STREAM_BUF];
1068    loop {
1069        let n = read_full(reader, &mut buf)?;
1070        if n == 0 {
1071            break;
1072        }
1073        translate_inplace(&mut buf[..n], &table);
1074        writer.write_all(&buf[..n])?;
1075    }
1076    Ok(())
1077}
1078
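// End-to-end sketch of the streaming API, added as an annotation (not original
// code): an in-memory reader (std::io::Cursor) and a Vec<u8> writer stand in
// for stdin/stdout.
#[cfg(test)]
mod translate_stream_example {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn translates_a_small_stream() {
        let mut input = Cursor::new(b"banana".to_vec());
        let mut output: Vec<u8> = Vec::new();
        translate(b"abc", b"xyz", &mut input, &mut output).unwrap();
        assert_eq!(output, b"yxnxnx".to_vec());
    }
}
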
1079/// Streaming SIMD range translation — single buffer, in-place transform.
1080/// Uses 4MB buffer for fewer syscalls (translate is compute-light).
1081fn translate_range_stream(
1082    lo: u8,
1083    hi: u8,
1084    offset: i8,
1085    reader: &mut impl Read,
1086    writer: &mut impl Write,
1087) -> io::Result<()> {
1088    let mut buf = vec![0u8; STREAM_BUF];
1089    loop {
1090        let n = read_full(reader, &mut buf)?;
1091        if n == 0 {
1092            break;
1093        }
1094        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
1095        writer.write_all(&buf[..n])?;
1096    }
1097    Ok(())
1098}
1099
1100/// Pure passthrough: copy stdin to stdout without transformation.
1101/// Uses a single 4MB buffer with direct read/write, no processing overhead.
1102fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1103    let mut buf = vec![0u8; STREAM_BUF];
1104    loop {
1105        let n = read_full(reader, &mut buf)?;
1106        if n == 0 {
1107            break;
1108        }
1109        writer.write_all(&buf[..n])?;
1110    }
1111    Ok(())
1112}
1113
1114/// Read as many bytes as possible into buf, retrying on partial reads.
1115/// Fast path: first read() often fills the entire buffer for regular files.
1116#[inline]
1117fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
1118    // Fast path: first read() usually fills the entire buffer for regular files
1119    let n = reader.read(buf)?;
1120    if n == buf.len() || n == 0 {
1121        return Ok(n);
1122    }
1123    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
1124    let mut total = n;
1125    while total < buf.len() {
1126        match reader.read(&mut buf[total..]) {
1127            Ok(0) => break,
1128            Ok(n) => total += n,
1129            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
1130            Err(e) => return Err(e),
1131        }
1132    }
1133    Ok(total)
1134}
1135
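// Illustrative check of the retry loop, added as an annotation (not original
// code). `DribbleReader` is a throwaway type defined only for this example: it
// returns at most 3 bytes per read() call, yet read_full still fills the buffer.
#[cfg(test)]
mod read_full_example {
    use super::*;

    struct DribbleReader<'a> {
        data: &'a [u8],
        pos: usize,
    }

    impl<'a> Read for DribbleReader<'a> {
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            let n = (self.data.len() - self.pos).min(3).min(buf.len());
            buf[..n].copy_from_slice(&self.data[self.pos..self.pos + n]);
            self.pos += n;
            Ok(n)
        }
    }

    #[test]
    fn fills_the_buffer_despite_short_reads() {
        let mut reader = DribbleReader { data: b"0123456789", pos: 0 };
        let mut buf = [0u8; 8];
        let n = read_full(&mut reader, &mut buf).unwrap();
        assert_eq!(n, 8);
        assert_eq!(&buf, b"01234567");
    }
}
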
1136pub fn translate_squeeze(
1137    set1: &[u8],
1138    set2: &[u8],
1139    reader: &mut impl Read,
1140    writer: &mut impl Write,
1141) -> io::Result<()> {
1142    let table = build_translate_table(set1, set2);
1143    let squeeze_set = build_member_set(set2);
1144
1145    // Two-pass optimization for range translations:
1146    // Pass 1: SIMD range translate in-place (much faster than a scalar table lookup)
1147    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
1148    // Even though it's two passes, the translate pass is so much faster with SIMD
1149    // that the total is still a net win.
1150    let range_info = detect_range_offset(&table);
1151
1152    let mut buf = vec![0u8; STREAM_BUF];
1153    let mut last_squeezed: u16 = 256;
1154
1155    loop {
1156        let n = read_full(reader, &mut buf)?;
1157        if n == 0 {
1158            break;
1159        }
1160        // Pass 1: translate
1161        if let Some((lo, hi, offset)) = range_info {
1162            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
1163        } else {
1164            translate_inplace(&mut buf[..n], &table);
1165        }
1166        // Pass 2: squeeze in-place
1167        let mut wp = 0;
1168        unsafe {
1169            let ptr = buf.as_mut_ptr();
1170            for i in 0..n {
1171                let b = *ptr.add(i);
1172                if is_member(&squeeze_set, b) {
1173                    if last_squeezed == b as u16 {
1174                        continue;
1175                    }
1176                    last_squeezed = b as u16;
1177                } else {
1178                    last_squeezed = 256;
1179                }
1180                *ptr.add(wp) = b;
1181                wp += 1;
1182            }
1183        }
1184        writer.write_all(&buf[..wp])?;
1185    }
1186    Ok(())
1187}
1188
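// Illustrative check, added as an annotation (not original code): bytes that
// become identical only after translation are squeezed, mirroring tr's -s
// behavior with two sets.
#[cfg(test)]
mod translate_squeeze_example {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn squeezes_repeats_produced_by_the_translation() {
        // 'a' and 'b' both map to 'x'; the resulting run of x's is squeezed,
        // while 'c' (not in set2) is left alone.
        let mut input = Cursor::new(b"aabb cc".to_vec());
        let mut output: Vec<u8> = Vec::new();
        translate_squeeze(b"ab", b"xx", &mut input, &mut output).unwrap();
        assert_eq!(output, b"x cc".to_vec());
    }
}
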
1189pub fn delete(
1190    delete_chars: &[u8],
1191    reader: &mut impl Read,
1192    writer: &mut impl Write,
1193) -> io::Result<()> {
1194    if delete_chars.len() == 1 {
1195        return delete_single_streaming(delete_chars[0], reader, writer);
1196    }
1197    if delete_chars.len() <= 3 {
1198        return delete_multi_streaming(delete_chars, reader, writer);
1199    }
1200
1201    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
1202    // use vectorized range comparison instead of scalar bitset lookup.
1203    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
1204    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1205        return delete_range_streaming(lo, hi, reader, writer);
1206    }
1207
1208    let member = build_member_set(delete_chars);
1209    let mut buf = vec![0u8; STREAM_BUF];
1210
1211    loop {
1212        let n = read_full(reader, &mut buf)?;
1213        if n == 0 {
1214            break;
1215        }
1216        let mut wp = 0;
1217        unsafe {
1218            let ptr = buf.as_mut_ptr();
1219            let mut i = 0;
1220            while i + 8 <= n {
1221                let b0 = *ptr.add(i);
1222                let b1 = *ptr.add(i + 1);
1223                let b2 = *ptr.add(i + 2);
1224                let b3 = *ptr.add(i + 3);
1225                let b4 = *ptr.add(i + 4);
1226                let b5 = *ptr.add(i + 5);
1227                let b6 = *ptr.add(i + 6);
1228                let b7 = *ptr.add(i + 7);
1229
1230                // Branchless: write byte then conditionally advance pointer.
1231                // Avoids branch mispredictions when most bytes are kept.
1232                *ptr.add(wp) = b0;
1233                wp += !is_member(&member, b0) as usize;
1234                *ptr.add(wp) = b1;
1235                wp += !is_member(&member, b1) as usize;
1236                *ptr.add(wp) = b2;
1237                wp += !is_member(&member, b2) as usize;
1238                *ptr.add(wp) = b3;
1239                wp += !is_member(&member, b3) as usize;
1240                *ptr.add(wp) = b4;
1241                wp += !is_member(&member, b4) as usize;
1242                *ptr.add(wp) = b5;
1243                wp += !is_member(&member, b5) as usize;
1244                *ptr.add(wp) = b6;
1245                wp += !is_member(&member, b6) as usize;
1246                *ptr.add(wp) = b7;
1247                wp += !is_member(&member, b7) as usize;
1248                i += 8;
1249            }
1250            while i < n {
1251                let b = *ptr.add(i);
1252                *ptr.add(wp) = b;
1253                wp += !is_member(&member, b) as usize;
1254                i += 1;
1255            }
1256        }
1257        writer.write_all(&buf[..wp])?;
1258    }
1259    Ok(())
1260}
1261
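// Illustrative check, added as an annotation (not original code): five
// non-contiguous bytes, so this exercises the bitset path rather than the
// memchr or SIMD-range fast paths.
#[cfg(test)]
mod delete_stream_example {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn removes_every_vowel() {
        let mut input = Cursor::new(b"hello world".to_vec());
        let mut output: Vec<u8> = Vec::new();
        delete(b"aeiou", &mut input, &mut output).unwrap();
        assert_eq!(output, b"hll wrld".to_vec());
    }
}
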
1262fn delete_single_streaming(
1263    ch: u8,
1264    reader: &mut impl Read,
1265    writer: &mut impl Write,
1266) -> io::Result<()> {
1267    let mut buf = vec![0u8; STREAM_BUF];
1268    loop {
1269        let n = read_full(reader, &mut buf)?;
1270        if n == 0 {
1271            break;
1272        }
1273        let mut wp = 0;
1274        let mut i = 0;
1275        while i < n {
1276            match memchr::memchr(ch, &buf[i..n]) {
1277                Some(offset) => {
1278                    if offset > 0 {
1279                        if wp != i {
1280                            unsafe {
1281                                std::ptr::copy(
1282                                    buf.as_ptr().add(i),
1283                                    buf.as_mut_ptr().add(wp),
1284                                    offset,
1285                                );
1286                            }
1287                        }
1288                        wp += offset;
1289                    }
1290                    i += offset + 1;
1291                }
1292                None => {
1293                    let run_len = n - i;
1294                    if run_len > 0 {
1295                        if wp != i {
1296                            unsafe {
1297                                std::ptr::copy(
1298                                    buf.as_ptr().add(i),
1299                                    buf.as_mut_ptr().add(wp),
1300                                    run_len,
1301                                );
1302                            }
1303                        }
1304                        wp += run_len;
1305                    }
1306                    break;
1307                }
1308            }
1309        }
1310        writer.write_all(&buf[..wp])?;
1311    }
1312    Ok(())
1313}
1314
1315fn delete_multi_streaming(
1316    chars: &[u8],
1317    reader: &mut impl Read,
1318    writer: &mut impl Write,
1319) -> io::Result<()> {
1320    let mut buf = vec![0u8; STREAM_BUF];
1321    loop {
1322        let n = read_full(reader, &mut buf)?;
1323        if n == 0 {
1324            break;
1325        }
1326        let mut wp = 0;
1327        let mut i = 0;
1328        while i < n {
1329            let found = if chars.len() == 2 {
1330                memchr::memchr2(chars[0], chars[1], &buf[i..n])
1331            } else {
1332                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
1333            };
1334            match found {
1335                Some(offset) => {
1336                    if offset > 0 {
1337                        if wp != i {
1338                            unsafe {
1339                                std::ptr::copy(
1340                                    buf.as_ptr().add(i),
1341                                    buf.as_mut_ptr().add(wp),
1342                                    offset,
1343                                );
1344                            }
1345                        }
1346                        wp += offset;
1347                    }
1348                    i += offset + 1;
1349                }
1350                None => {
1351                    let run_len = n - i;
1352                    if run_len > 0 {
1353                        if wp != i {
1354                            unsafe {
1355                                std::ptr::copy(
1356                                    buf.as_ptr().add(i),
1357                                    buf.as_mut_ptr().add(wp),
1358                                    run_len,
1359                                );
1360                            }
1361                        }
1362                        wp += run_len;
1363                    }
1364                    break;
1365                }
1366            }
1367        }
1368        writer.write_all(&buf[..wp])?;
1369    }
1370    Ok(())
1371}
1372
1373pub fn delete_squeeze(
1374    delete_chars: &[u8],
1375    squeeze_chars: &[u8],
1376    reader: &mut impl Read,
1377    writer: &mut impl Write,
1378) -> io::Result<()> {
1379    let delete_set = build_member_set(delete_chars);
1380    let squeeze_set = build_member_set(squeeze_chars);
1381    let mut buf = vec![0u8; STREAM_BUF];
1382    let mut last_squeezed: u16 = 256;
1383
1384    loop {
1385        let n = read_full(reader, &mut buf)?;
1386        if n == 0 {
1387            break;
1388        }
1389        let mut wp = 0;
1390        unsafe {
1391            let ptr = buf.as_mut_ptr();
1392            for i in 0..n {
1393                let b = *ptr.add(i);
1394                if is_member(&delete_set, b) {
1395                    continue;
1396                }
1397                if is_member(&squeeze_set, b) {
1398                    if last_squeezed == b as u16 {
1399                        continue;
1400                    }
1401                    last_squeezed = b as u16;
1402                } else {
1403                    last_squeezed = 256;
1404                }
1405                *ptr.add(wp) = b;
1406                wp += 1;
1407            }
1408        }
1409        writer.write_all(&buf[..wp])?;
1410    }
1411    Ok(())
1412}
1413
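// Illustrative check, added as an annotation (not original code): deletion and
// squeezing happen in a single pass over each buffer.
#[cfg(test)]
mod delete_squeeze_example {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn deletes_then_squeezes_what_remains() {
        // Delete 'l', squeeze the runs of 'o' that remain.
        let mut input = Cursor::new(b"hello woorld".to_vec());
        let mut output: Vec<u8> = Vec::new();
        delete_squeeze(b"l", b"o", &mut input, &mut output).unwrap();
        assert_eq!(output, b"heo word".to_vec());
    }
}
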
1414pub fn squeeze(
1415    squeeze_chars: &[u8],
1416    reader: &mut impl Read,
1417    writer: &mut impl Write,
1418) -> io::Result<()> {
1419    if squeeze_chars.len() == 1 {
1420        return squeeze_single_stream(squeeze_chars[0], reader, writer);
1421    }
1422
1423    let member = build_member_set(squeeze_chars);
1424    let mut buf = vec![0u8; STREAM_BUF];
1425    let mut last_squeezed: u16 = 256;
1426
1427    loop {
1428        let n = read_full(reader, &mut buf)?;
1429        if n == 0 {
1430            break;
1431        }
1432        let mut wp = 0;
1433        unsafe {
1434            let ptr = buf.as_mut_ptr();
1435            for i in 0..n {
1436                let b = *ptr.add(i);
1437                if is_member(&member, b) {
1438                    if last_squeezed == b as u16 {
1439                        continue;
1440                    }
1441                    last_squeezed = b as u16;
1442                } else {
1443                    last_squeezed = 256;
1444                }
1445                *ptr.add(wp) = b;
1446                wp += 1;
1447            }
1448        }
1449        writer.write_all(&buf[..wp])?;
1450    }
1451    Ok(())
1452}
1453
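// Illustrative check, added as an annotation (not original code): only runs of
// bytes from the squeeze set collapse; a run is reset by any other byte.
#[cfg(test)]
mod squeeze_example {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn collapses_runs_of_each_squeeze_byte() {
        let mut input = Cursor::new(b"aaab  ccc  b".to_vec());
        let mut output: Vec<u8> = Vec::new();
        squeeze(b"ac ", &mut input, &mut output).unwrap();
        assert_eq!(output, b"ab c b".to_vec());
    }
}
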
1454fn squeeze_single_stream(
1455    ch: u8,
1456    reader: &mut impl Read,
1457    writer: &mut impl Write,
1458) -> io::Result<()> {
1459    let mut buf = vec![0u8; STREAM_BUF];
1460    let mut was_squeeze_char = false;
1461
1462    loop {
1463        let n = read_full(reader, &mut buf)?;
1464        if n == 0 {
1465            break;
1466        }
1467
1468        let mut wp = 0;
1469        let mut i = 0;
1470
1471        while i < n {
1472            if was_squeeze_char && buf[i] == ch {
1473                i += 1;
1474                while i < n && buf[i] == ch {
1475                    i += 1;
1476                }
1477                if i >= n {
1478                    break;
1479                }
1480            }
1481
1482            match memchr::memchr(ch, &buf[i..n]) {
1483                Some(offset) => {
1484                    let run_len = offset;
1485                    if run_len > 0 {
1486                        if wp != i {
1487                            unsafe {
1488                                std::ptr::copy(
1489                                    buf.as_ptr().add(i),
1490                                    buf.as_mut_ptr().add(wp),
1491                                    run_len,
1492                                );
1493                            }
1494                        }
1495                        wp += run_len;
1496                    }
1497                    i += run_len;
1498
1499                    unsafe {
1500                        *buf.as_mut_ptr().add(wp) = ch;
1501                    }
1502                    wp += 1;
1503                    was_squeeze_char = true;
1504                    i += 1;
1505                    while i < n && buf[i] == ch {
1506                        i += 1;
1507                    }
1508                }
1509                None => {
1510                    let run_len = n - i;
1511                    if run_len > 0 {
1512                        if wp != i {
1513                            unsafe {
1514                                std::ptr::copy(
1515                                    buf.as_ptr().add(i),
1516                                    buf.as_mut_ptr().add(wp),
1517                                    run_len,
1518                                );
1519                            }
1520                        }
1521                        wp += run_len;
1522                    }
1523                    was_squeeze_char = false;
1524                    break;
1525                }
1526            }
1527        }
1528
1529        writer.write_all(&buf[..wp])?;
1530    }
1531    Ok(())
1532}
1533
1534// ============================================================================
1535// Mmap-based functions (zero-copy input from byte slice)
1536// ============================================================================
1537
1538/// Maximum data size for the single-allocation mmap paths (translate, delete, squeeze).
1539/// At or below this limit, process ALL data into one buffer and do a single write_all.
1540/// Above it, fall back to a chunked approach to limit memory usage.
1541const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;
1542
1543/// Translate bytes from an mmap'd byte slice.
1544/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
1545/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
1546/// Falls back to scalar 256-byte table lookup for general translations.
1547///
1548/// Dispatch order: data >= 2MB (PARALLEL_THRESHOLD) is translated in parallel with rayon
1549/// into one buffer, then written with a single write_all; smaller data is translated
1550/// single-threaded into one buffer and written once. A chunked 4MB-buffer fallback remains for completeness.
1551pub fn translate_mmap(
1552    set1: &[u8],
1553    set2: &[u8],
1554    data: &[u8],
1555    writer: &mut impl Write,
1556) -> io::Result<()> {
1557    let table = build_translate_table(set1, set2);
1558
1559    // Check if table is identity — pure passthrough
1560    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1561    if is_identity {
1562        return writer.write_all(data);
1563    }
1564
1565    // Try SIMD fast path for single-range constant-offset translations
1566    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1567        return translate_mmap_range(data, writer, lo, hi, offset);
1568    }
1569
1570    // General case: table lookup (with parallel processing for large data)
1571    translate_mmap_table(data, writer, &table)
1572}
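
// Illustrative sketch of the public entry point on small data (single-write path),
// assuming the range-offset SIMD helpers referenced above behave as documented;
// the result is the same whichever internal path is taken.
#[cfg(test)]
mod translate_mmap_sketch {
    use super::*;

    #[test]
    fn maps_a_lowercase_range_to_uppercase() {
        let mut out: Vec<u8> = Vec::new();
        translate_mmap(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
            b"tr core, hello!",
            &mut out,
        )
        .unwrap();
        assert_eq!(&out[..], &b"TR CORE, HELLO!"[..]);
    }
}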
1573
1574/// SIMD range translate for mmap data, with rayon parallel processing.
1575fn translate_mmap_range(
1576    data: &[u8],
1577    writer: &mut impl Write,
1578    lo: u8,
1579    hi: u8,
1580    offset: i8,
1581) -> io::Result<()> {
1582    // Parallel path: split data into chunks, translate each in parallel
1583    if data.len() >= PARALLEL_THRESHOLD {
1584        let mut buf = alloc_uninit_vec(data.len());
1585        let n_threads = rayon::current_num_threads().max(1);
1586        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1587
1588        // Process chunks in parallel: each thread writes to its slice of buf
1589        data.par_chunks(chunk_size)
1590            .zip(buf.par_chunks_mut(chunk_size))
1591            .for_each(|(src_chunk, dst_chunk)| {
1592                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
1593            });
1594
1595        return writer.write_all(&buf);
1596    }
1597
1598    // Small data: single-threaded SIMD
1599    if data.len() <= SINGLE_WRITE_LIMIT {
1600        let mut buf = alloc_uninit_vec(data.len());
1601        translate_range_simd(data, &mut buf, lo, hi, offset);
1602        return writer.write_all(&buf);
1603    }
1604    // Chunked fallback: unreachable in practice, since PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT means any data this large already took the parallel path.
1605    let mut buf = alloc_uninit_vec(BUF_SIZE);
1606    for chunk in data.chunks(BUF_SIZE) {
1607        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
1608        writer.write_all(&buf[..chunk.len()])?;
1609    }
1610    Ok(())
1611}
1612
1613/// General table-lookup translate for mmap data, with rayon parallel processing.
1614fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
1615    // Parallel path: split data into chunks, translate each in parallel
1616    if data.len() >= PARALLEL_THRESHOLD {
1617        let mut buf = alloc_uninit_vec(data.len());
1618        let n_threads = rayon::current_num_threads().max(1);
1619        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1620
1621        data.par_chunks(chunk_size)
1622            .zip(buf.par_chunks_mut(chunk_size))
1623            .for_each(|(src_chunk, dst_chunk)| {
1624                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
1625            });
1626
1627        return writer.write_all(&buf);
1628    }
1629
1630    // Small data: single-threaded
1631    if data.len() <= SINGLE_WRITE_LIMIT {
1632        let mut buf = alloc_uninit_vec(data.len());
1633        translate_to(data, &mut buf, table);
1634        return writer.write_all(&buf);
1635    }
1636    let mut buf = alloc_uninit_vec(BUF_SIZE);
1637    for chunk in data.chunks(BUF_SIZE) {
1638        translate_to(chunk, &mut buf[..chunk.len()], table);
1639        writer.write_all(&buf[..chunk.len()])?;
1640    }
1641    Ok(())
1642}
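
// A small sketch of the general table-lookup path: the mapping a->x, b->z, c->y has
// no constant offset, so it cannot use the SIMD range path. Expected output assumes
// build_translate_table leaves unmapped bytes as identity, as defined above.
#[cfg(test)]
mod translate_table_sketch {
    use super::*;

    #[test]
    fn applies_an_arbitrary_byte_mapping() {
        let table = build_translate_table(b"abc", b"xzy");
        let mut out: Vec<u8> = Vec::new();
        translate_mmap_table(b"cabbage", &mut out, &table).unwrap();
        assert_eq!(&out[..], &b"yxzzxge"[..]);
    }
}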
1643
1644/// Translate + squeeze from mmap'd byte slice.
1645///
1646/// Dispatch order: data >= 2MB uses a two-phase approach (parallel translate, then a
1647/// sequential in-place squeeze); smaller data is translated+squeezed in a single pass
1648/// into one buffer and written with one syscall, with a chunked fallback above 16MB.
1649pub fn translate_squeeze_mmap(
1650    set1: &[u8],
1651    set2: &[u8],
1652    data: &[u8],
1653    writer: &mut impl Write,
1654) -> io::Result<()> {
1655    let table = build_translate_table(set1, set2);
1656    let squeeze_set = build_member_set(set2);
1657
1658    // For large data: two-phase approach
1659    // Phase 1: parallel translate into buffer
1660    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
1661    //          (squeeze only removes bytes, never grows, so no second allocation needed)
1662    if data.len() >= PARALLEL_THRESHOLD {
1663        // Phase 1: parallel translate
1664        let mut translated = alloc_uninit_vec(data.len());
1665        let range_info = detect_range_offset(&table);
1666        let n_threads = rayon::current_num_threads().max(1);
1667        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1668
1669        if let Some((lo, hi, offset)) = range_info {
1670            data.par_chunks(chunk_size)
1671                .zip(translated.par_chunks_mut(chunk_size))
1672                .for_each(|(src_chunk, dst_chunk)| {
1673                    translate_range_simd(
1674                        src_chunk,
1675                        &mut dst_chunk[..src_chunk.len()],
1676                        lo,
1677                        hi,
1678                        offset,
1679                    );
1680                });
1681        } else {
1682            data.par_chunks(chunk_size)
1683                .zip(translated.par_chunks_mut(chunk_size))
1684                .for_each(|(src_chunk, dst_chunk)| {
1685                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
1686                });
1687        }
1688
1689        // Phase 2: squeeze in-place on the translated buffer.
1690        // Squeeze only removes bytes (never grows), so the read index never falls behind
1691        // the write index; compacting into the same buffer saves a second data.len() allocation.
1692        let mut last_squeezed: u16 = 256;
1693        let len = translated.len();
1694        let mut wp = 0;
1695        unsafe {
1696            let ptr = translated.as_mut_ptr();
1697            let mut i = 0;
1698            while i < len {
1699                let b = *ptr.add(i);
1700                if is_member(&squeeze_set, b) {
1701                    if last_squeezed == b as u16 {
1702                        i += 1;
1703                        continue;
1704                    }
1705                    last_squeezed = b as u16;
1706                } else {
1707                    last_squeezed = 256;
1708                }
1709                *ptr.add(wp) = b;
1710                wp += 1;
1711                i += 1;
1712            }
1713        }
1714        return writer.write_all(&translated[..wp]);
1715    }
1716
1717    // Single-write fast path: translate+squeeze all data in one pass, one write
1718    if data.len() <= SINGLE_WRITE_LIMIT {
1719        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
1720        let mut last_squeezed: u16 = 256;
1721        unsafe {
1722            buf.set_len(data.len());
1723            let outp: *mut u8 = buf.as_mut_ptr();
1724            let inp = data.as_ptr();
1725            let len = data.len();
1726            let mut wp = 0;
1727            let mut i = 0;
1728            while i < len {
1729                let translated = *table.get_unchecked(*inp.add(i) as usize);
1730                if is_member(&squeeze_set, translated) {
1731                    if last_squeezed == translated as u16 {
1732                        i += 1;
1733                        continue;
1734                    }
1735                    last_squeezed = translated as u16;
1736                } else {
1737                    last_squeezed = 256;
1738                }
1739                *outp.add(wp) = translated;
1740                wp += 1;
1741                i += 1;
1742            }
1743            buf.set_len(wp);
1744        }
1745        return writer.write_all(&buf);
1746    }
1747
1748    // Chunked path for large data
1749    let buf_size = data.len().min(BUF_SIZE);
1750    let mut buf = vec![0u8; buf_size];
1751    let mut last_squeezed: u16 = 256;
1752
1753    for chunk in data.chunks(buf_size) {
1754        translate_to(chunk, &mut buf[..chunk.len()], &table);
1755        let mut wp = 0;
1756        unsafe {
1757            let ptr = buf.as_mut_ptr();
1758            for i in 0..chunk.len() {
1759                let b = *ptr.add(i);
1760                if is_member(&squeeze_set, b) {
1761                    if last_squeezed == b as u16 {
1762                        continue;
1763                    }
1764                    last_squeezed = b as u16;
1765                } else {
1766                    last_squeezed = 256;
1767                }
1768                *ptr.add(wp) = b;
1769                wp += 1;
1770            }
1771        }
1772        writer.write_all(&buf[..wp])?;
1773    }
1774    Ok(())
1775}
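
// Sketch of translate+squeeze semantics on the small-data path: `tr 'a' 'b' -s`
// turns "banana" into "bbnbnb", then squeezes runs of 'b' (set2 membership).
#[cfg(test)]
mod translate_squeeze_sketch {
    use super::*;

    #[test]
    fn squeezes_runs_produced_by_the_translation() {
        let mut out: Vec<u8> = Vec::new();
        translate_squeeze_mmap(b"a", b"b", b"banana", &mut out).unwrap();
        assert_eq!(&out[..], &b"bnbnb"[..]);
    }
}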
1776
1777/// Delete from mmap'd byte slice.
1778///
1779/// Small sets dispatch to memchr-based or SIMD-range paths first. Otherwise:
1780/// data >= 2MB is deleted in parallel with rayon into one buffer (single write_all);
1781/// smaller data is deleted into one buffer and written once, with a chunked fallback above 16MB.
1782pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1783    if delete_chars.len() == 1 {
1784        return delete_single_char_mmap(delete_chars[0], data, writer);
1785    }
1786    if delete_chars.len() <= 3 {
1787        return delete_multi_memchr_mmap(delete_chars, data, writer);
1788    }
1789
1790    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
1791    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1792        return delete_range_mmap(data, writer, lo, hi);
1793    }
1794
1795    let member = build_member_set(delete_chars);
1796
1797    // Parallel path: pre-allocate a single output buffer of data.len() and have each
1798    // thread write to its non-overlapping slice, then do a single write_all.
1799    // This avoids per-chunk Vec allocations that the old approach had.
1800    if data.len() >= PARALLEL_THRESHOLD {
1801        let n_threads = rayon::current_num_threads().max(1);
1802        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1803
1804        // Each thread deletes into its slice of outbuf and returns bytes written.
1805        let mut outbuf = alloc_uninit_vec(data.len());
1806        let chunk_lens: Vec<usize> = data
1807            .par_chunks(chunk_size)
1808            .zip(outbuf.par_chunks_mut(chunk_size))
1809            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
1810            .collect();
1811
1812        // Compact: move each chunk's output together so the kept bytes are contiguous.
1813        // chunk_lens[i] is how many bytes thread i wrote into its slice of outbuf.
1814        // Shifting is needed because each dst_chunk starts at a multiple of chunk_size.
1815        let mut write_pos = 0;
1816        let mut src_offset = 0;
1817        for &clen in &chunk_lens {
1818            if clen > 0 && src_offset != write_pos {
1819                unsafe {
1820                    std::ptr::copy(
1821                        outbuf.as_ptr().add(src_offset),
1822                        outbuf.as_mut_ptr().add(write_pos),
1823                        clen,
1824                    );
1825                }
1826            }
1827            write_pos += clen;
1828            src_offset += chunk_size;
1829        }
1830
1831        return writer.write_all(&outbuf[..write_pos]);
1832    }
1833
1834    // Single-write fast path: delete into one buffer, one write
1835    if data.len() <= SINGLE_WRITE_LIMIT {
1836        let mut outbuf = alloc_uninit_vec(data.len());
1837        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
1838        return writer.write_all(&outbuf[..out_pos]);
1839    }
1840
1841    // Chunked path for large data
1842    let buf_size = data.len().min(BUF_SIZE);
1843    let mut outbuf = alloc_uninit_vec(buf_size);
1844
1845    for chunk in data.chunks(buf_size) {
1846        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
1847        writer.write_all(&outbuf[..out_pos])?;
1848    }
1849    Ok(())
1850}
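
// Sketch of the bitset delete path: five non-contiguous delete chars skip the
// single-char and memchr2/3 dispatches and (assuming detect_delete_range only
// matches contiguous ranges) land on the 256-bit membership set.
#[cfg(test)]
mod delete_bitset_sketch {
    use super::*;

    #[test]
    fn deletes_vowels_through_the_membership_set() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b"aeiou", b"hello world", &mut out).unwrap();
        assert_eq!(&out[..], &b"hll wrld"[..]);
    }
}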
1851
1852/// SIMD range delete for mmap data, with rayon parallel processing.
1853fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
1854    // Parallel path: each thread deletes from its chunk into a local Vec
1855    if data.len() >= PARALLEL_THRESHOLD {
1856        let n_threads = rayon::current_num_threads().max(1);
1857        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1858
1859        let results: Vec<Vec<u8>> = data
1860            .par_chunks(chunk_size)
1861            .map(|chunk| {
1862                let mut out = alloc_uninit_vec(chunk.len());
1863                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
1864                unsafe { out.set_len(wp) };
1865                out
1866            })
1867            .collect();
1868
1869        let slices: Vec<std::io::IoSlice> = results
1870            .iter()
1871            .filter(|r| !r.is_empty())
1872            .map(|r| std::io::IoSlice::new(r))
1873            .collect();
1874        return write_ioslices(writer, &slices);
1875    }
1876
1877    // Single-write fast path
1878    if data.len() <= SINGLE_WRITE_LIMIT {
1879        let mut outbuf = alloc_uninit_vec(data.len());
1880        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
1881        return writer.write_all(&outbuf[..wp]);
1882    }
1883
1884    // Chunked path
1885    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
1886    for chunk in data.chunks(BUF_SIZE) {
1887        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
1888        writer.write_all(&outbuf[..wp])?;
1889    }
1890    Ok(())
1891}
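
// Sketch of the contiguous-range delete: b"0123456789" should be recognized by
// detect_delete_range (an assumption based on its use above) and routed through
// delete_range_mmap; the output is identical on any path.
#[cfg(test)]
mod delete_range_sketch {
    use super::*;

    #[test]
    fn strips_a_contiguous_digit_range() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b"0123456789", b"abc123def456", &mut out).unwrap();
        assert_eq!(&out[..], &b"abcdef"[..]);
    }
}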
1892
1893/// Delete bytes from chunk using a bitset, writing into a pre-allocated buffer of at least chunk.len() bytes.
1894/// Returns the number of bytes written. The unrolled loop is branchless: every byte is stored, but out_pos only advances for non-members.
1895#[inline]
1896fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
1897    let len = chunk.len();
1898    let mut out_pos = 0;
1899    let mut i = 0;
1900
1901    while i + 8 <= len {
1902        unsafe {
1903            let b0 = *chunk.get_unchecked(i);
1904            let b1 = *chunk.get_unchecked(i + 1);
1905            let b2 = *chunk.get_unchecked(i + 2);
1906            let b3 = *chunk.get_unchecked(i + 3);
1907            let b4 = *chunk.get_unchecked(i + 4);
1908            let b5 = *chunk.get_unchecked(i + 5);
1909            let b6 = *chunk.get_unchecked(i + 6);
1910            let b7 = *chunk.get_unchecked(i + 7);
1911
1912            *outbuf.get_unchecked_mut(out_pos) = b0;
1913            out_pos += !is_member(member, b0) as usize;
1914            *outbuf.get_unchecked_mut(out_pos) = b1;
1915            out_pos += !is_member(member, b1) as usize;
1916            *outbuf.get_unchecked_mut(out_pos) = b2;
1917            out_pos += !is_member(member, b2) as usize;
1918            *outbuf.get_unchecked_mut(out_pos) = b3;
1919            out_pos += !is_member(member, b3) as usize;
1920            *outbuf.get_unchecked_mut(out_pos) = b4;
1921            out_pos += !is_member(member, b4) as usize;
1922            *outbuf.get_unchecked_mut(out_pos) = b5;
1923            out_pos += !is_member(member, b5) as usize;
1924            *outbuf.get_unchecked_mut(out_pos) = b6;
1925            out_pos += !is_member(member, b6) as usize;
1926            *outbuf.get_unchecked_mut(out_pos) = b7;
1927            out_pos += !is_member(member, b7) as usize;
1928        }
1929        i += 8;
1930    }
1931
1932    while i < len {
1933        unsafe {
1934            let b = *chunk.get_unchecked(i);
1935            *outbuf.get_unchecked_mut(out_pos) = b;
1936            out_pos += !is_member(member, b) as usize;
1937        }
1938        i += 1;
1939    }
1940
1941    out_pos
1942}
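
// Direct sketch of the branchless kernel above: bytes are always stored, and the
// write cursor advances only for non-members, so deleted bytes are overwritten.
#[cfg(test)]
mod delete_chunk_bitset_sketch {
    use super::*;

    #[test]
    fn compacts_kept_bytes_and_reports_the_count() {
        let member = build_member_set(b"ab");
        let src = b"abcabc";
        let mut out = vec![0u8; src.len()];
        let written = delete_chunk_bitset_into(src, &member, &mut out);
        assert_eq!(written, 2);
        assert_eq!(&out[..written], &b"cc"[..]);
    }
}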
1943
1944fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1945    // Parallel path for large data: each thread deletes from its chunk into its own Vec,
1946    // then writev (batched into MAX_IOV groups) writes all results with few syscalls.
1947    if data.len() >= PARALLEL_THRESHOLD {
1948        let n_threads = rayon::current_num_threads().max(1);
1949        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1950
1951        let results: Vec<Vec<u8>> = data
1952            .par_chunks(chunk_size)
1953            .map(|chunk| {
1954                let mut out = Vec::with_capacity(chunk.len());
1955                let mut last = 0;
1956                for pos in memchr::memchr_iter(ch, chunk) {
1957                    if pos > last {
1958                        out.extend_from_slice(&chunk[last..pos]);
1959                    }
1960                    last = pos + 1;
1961                }
1962                if last < chunk.len() {
1963                    out.extend_from_slice(&chunk[last..]);
1964                }
1965                out
1966            })
1967            .collect();
1968
1969        // Use writev to batch all results into fewer syscalls
1970        let slices: Vec<std::io::IoSlice> = results
1971            .iter()
1972            .filter(|r| !r.is_empty())
1973            .map(|r| std::io::IoSlice::new(r))
1974            .collect();
1975        return write_ioslices(writer, &slices);
1976    }
1977
1978    // Single-write fast path: collect all non-deleted spans into one buffer
1979    if data.len() <= SINGLE_WRITE_LIMIT {
1980        let mut outbuf = Vec::with_capacity(data.len());
1981        let mut last = 0;
1982        for pos in memchr::memchr_iter(ch, data) {
1983            if pos > last {
1984                outbuf.extend_from_slice(&data[last..pos]);
1985            }
1986            last = pos + 1;
1987        }
1988        if last < data.len() {
1989            outbuf.extend_from_slice(&data[last..]);
1990        }
1991        return writer.write_all(&outbuf);
1992    }
1993
1994    // Chunked path for large data
1995    let buf_size = data.len().min(BUF_SIZE);
1996    let mut outbuf = vec![0u8; buf_size];
1997
1998    for chunk in data.chunks(buf_size) {
1999        let mut wp = 0;
2000        let mut last = 0;
2001        for pos in memchr::memchr_iter(ch, chunk) {
2002            if pos > last {
2003                let run = pos - last;
2004                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
2005                wp += run;
2006            }
2007            last = pos + 1;
2008        }
2009        if last < chunk.len() {
2010            let run = chunk.len() - last;
2011            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
2012            wp += run;
2013        }
2014        writer.write_all(&outbuf[..wp])?;
2015    }
2016    Ok(())
2017}
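
// Sketch of the single-char delete dispatch: one delete byte goes through the
// memchr-driven span copier above; small input takes the single-write path.
#[cfg(test)]
mod delete_single_char_sketch {
    use super::*;

    #[test]
    fn removes_every_occurrence_of_one_byte() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b"-", b"2024-06-01", &mut out).unwrap();
        assert_eq!(&out[..], &b"20240601"[..]);
    }
}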
2018
2019fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2020    let c0 = chars[0];
2021    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
2022    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
2023    let is_three = chars.len() >= 3;
2024
2025    // Parallel path for large data
2026    if data.len() >= PARALLEL_THRESHOLD {
2027        let n_threads = rayon::current_num_threads().max(1);
2028        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2029
2030        let results: Vec<Vec<u8>> = data
2031            .par_chunks(chunk_size)
2032            .map(|chunk| {
2033                let mut out = Vec::with_capacity(chunk.len());
2034                let mut last = 0;
2035                if is_three {
2036                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
2037                        if pos > last {
2038                            out.extend_from_slice(&chunk[last..pos]);
2039                        }
2040                        last = pos + 1;
2041                    }
2042                } else {
2043                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
2044                        if pos > last {
2045                            out.extend_from_slice(&chunk[last..pos]);
2046                        }
2047                        last = pos + 1;
2048                    }
2049                }
2050                if last < chunk.len() {
2051                    out.extend_from_slice(&chunk[last..]);
2052                }
2053                out
2054            })
2055            .collect();
2056
2057        // Use writev to batch all results into fewer syscalls
2058        let slices: Vec<std::io::IoSlice> = results
2059            .iter()
2060            .filter(|r| !r.is_empty())
2061            .map(|r| std::io::IoSlice::new(r))
2062            .collect();
2063        return write_ioslices(writer, &slices);
2064    }
2065
2066    // Single-write fast path: collect all non-deleted spans into one buffer
2067    if data.len() <= SINGLE_WRITE_LIMIT {
2068        let mut outbuf = Vec::with_capacity(data.len());
2069        let mut last = 0;
2070        if is_three {
2071            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
2072                if pos > last {
2073                    outbuf.extend_from_slice(&data[last..pos]);
2074                }
2075                last = pos + 1;
2076            }
2077        } else {
2078            for pos in memchr::memchr2_iter(c0, c1, data) {
2079                if pos > last {
2080                    outbuf.extend_from_slice(&data[last..pos]);
2081                }
2082                last = pos + 1;
2083            }
2084        }
2085        if last < data.len() {
2086            outbuf.extend_from_slice(&data[last..]);
2087        }
2088        return writer.write_all(&outbuf);
2089    }
2090
2091    // Chunked path for large data
2092    let buf_size = data.len().min(BUF_SIZE);
2093    let mut outbuf = vec![0u8; buf_size];
2094
2095    for chunk in data.chunks(buf_size) {
2096        let mut wp = 0;
2097        let mut last = 0;
2098
2099        // Iterate directly over memchr iterator without collecting into Vec<usize>.
2100        // Positions are used exactly once in order, so no intermediate allocation needed.
2101        if is_three {
2102            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
2103                if pos > last {
2104                    let run = pos - last;
2105                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
2106                    wp += run;
2107                }
2108                last = pos + 1;
2109            }
2110        } else {
2111            for pos in memchr::memchr2_iter(c0, c1, chunk) {
2112                if pos > last {
2113                    let run = pos - last;
2114                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
2115                    wp += run;
2116                }
2117                last = pos + 1;
2118            }
2119        }
2120
2121        if last < chunk.len() {
2122            let run = chunk.len() - last;
2123            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
2124            wp += run;
2125        }
2126        writer.write_all(&outbuf[..wp])?;
2127    }
2128    Ok(())
2129}
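
// Sketch of the 2-3 char delete dispatch: two delete bytes route through the
// memchr2-based copier above via delete_mmap.
#[cfg(test)]
mod delete_two_chars_sketch {
    use super::*;

    #[test]
    fn removes_commas_and_spaces() {
        let mut out: Vec<u8> = Vec::new();
        delete_mmap(b", ", b"a, b, c", &mut out).unwrap();
        assert_eq!(&out[..], &b"abc"[..]);
    }
}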
2130
2131/// Delete + squeeze from mmap'd byte slice.
2132///
2133/// For data <= 16MB: delete+squeeze into one buffer, one write syscall.
2134/// For data > 16MB: chunked approach to limit memory.
2135pub fn delete_squeeze_mmap(
2136    delete_chars: &[u8],
2137    squeeze_chars: &[u8],
2138    data: &[u8],
2139    writer: &mut impl Write,
2140) -> io::Result<()> {
2141    let delete_set = build_member_set(delete_chars);
2142    let squeeze_set = build_member_set(squeeze_chars);
2143
2144    // Single-write fast path: delete+squeeze all data in one pass, one write
2145    if data.len() <= SINGLE_WRITE_LIMIT {
2146        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
2147        let mut last_squeezed: u16 = 256;
2148        unsafe {
2149            outbuf.set_len(data.len());
2150            let outp: *mut u8 = outbuf.as_mut_ptr();
2151            let inp = data.as_ptr();
2152            let len = data.len();
2153            let mut out_pos = 0;
2154            let mut i = 0;
2155            while i < len {
2156                let b = *inp.add(i);
2157                if is_member(&delete_set, b) {
2158                    i += 1;
2159                    continue;
2160                }
2161                if is_member(&squeeze_set, b) {
2162                    if last_squeezed == b as u16 {
2163                        i += 1;
2164                        continue;
2165                    }
2166                    last_squeezed = b as u16;
2167                } else {
2168                    last_squeezed = 256;
2169                }
2170                *outp.add(out_pos) = b;
2171                out_pos += 1;
2172                i += 1;
2173            }
2174            outbuf.set_len(out_pos);
2175        }
2176        return writer.write_all(&outbuf);
2177    }
2178
2179    // Chunked path for large data
2180    let buf_size = data.len().min(BUF_SIZE);
2181    let mut outbuf = vec![0u8; buf_size];
2182    let mut last_squeezed: u16 = 256;
2183
2184    for chunk in data.chunks(buf_size) {
2185        let mut out_pos = 0;
2186        for &b in chunk {
2187            if is_member(&delete_set, b) {
2188                continue;
2189            }
2190            if is_member(&squeeze_set, b) {
2191                if last_squeezed == b as u16 {
2192                    continue;
2193                }
2194                last_squeezed = b as u16;
2195            } else {
2196                last_squeezed = 256;
2197            }
2198            unsafe {
2199                *outbuf.get_unchecked_mut(out_pos) = b;
2200            }
2201            out_pos += 1;
2202        }
2203        writer.write_all(&outbuf[..out_pos])?;
2204    }
2205    Ok(())
2206}
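
// Sketch of combined delete+squeeze (`tr -ds 'l' 'o'`): every 'l' is dropped and
// the runs of 'o' that remain afterwards are collapsed in the same pass.
#[cfg(test)]
mod delete_squeeze_sketch {
    use super::*;

    #[test]
    fn deletes_then_squeezes_in_one_pass() {
        let mut out: Vec<u8> = Vec::new();
        delete_squeeze_mmap(b"l", b"o", b"hellooo world", &mut out).unwrap();
        assert_eq!(&out[..], &b"heo word"[..]);
    }
}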
2207
2208/// Squeeze from mmap'd byte slice.
2209///
2210/// Sets of 1-3 chars dispatch to specialized memchr paths first. Otherwise:
2211/// data >= 2MB is squeezed per-chunk in parallel with boundary fixup and written via writev;
2212/// smaller data is squeezed into one buffer and written once, with a chunked fallback above 16MB.
2213pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2214    if squeeze_chars.len() == 1 {
2215        return squeeze_single_mmap(squeeze_chars[0], data, writer);
2216    }
2217    if squeeze_chars.len() == 2 {
2218        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
2219    }
2220    if squeeze_chars.len() == 3 {
2221        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
2222    }
2223
2224    let member = build_member_set(squeeze_chars);
2225
2226    // Parallel path: squeeze each chunk independently, then fix boundaries
2227    if data.len() >= PARALLEL_THRESHOLD {
2228        let n_threads = rayon::current_num_threads().max(1);
2229        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2230
2231        let results: Vec<Vec<u8>> = data
2232            .par_chunks(chunk_size)
2233            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2234            .collect();
2235
2236        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
2237        // and chunk N+1 starts with same byte B, and B is in squeeze set,
2238        // skip the first byte(s) of chunk N+1 that equal B.
2239        // Collect slices for writev to minimize syscalls.
2240        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2241        for (idx, result) in results.iter().enumerate() {
2242            if result.is_empty() {
2243                continue;
2244            }
2245            if idx > 0 {
2246                // Check boundary: does previous chunk end with same squeezable byte?
2247                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2248                    if is_member(&member, prev_last) {
2249                        // Skip leading bytes in this chunk that equal prev_last
2250                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2251                        if skip < result.len() {
2252                            slices.push(std::io::IoSlice::new(&result[skip..]));
2253                        }
2254                        continue;
2255                    }
2256                }
2257            }
2258            slices.push(std::io::IoSlice::new(result));
2259        }
2260        return write_ioslices(writer, &slices);
2261    }
2262
2263    // Single-write fast path: squeeze all data into one buffer, one write
2264    if data.len() <= SINGLE_WRITE_LIMIT {
2265        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
2266        let mut last_squeezed: u16 = 256;
2267        let len = data.len();
2268        let mut wp = 0;
2269        let mut i = 0;
2270
2271        unsafe {
2272            outbuf.set_len(data.len());
2273            let inp = data.as_ptr();
2274            let outp: *mut u8 = outbuf.as_mut_ptr();
2275
2276            while i < len {
2277                let b = *inp.add(i);
2278                if is_member(&member, b) {
2279                    if last_squeezed != b as u16 {
2280                        *outp.add(wp) = b;
2281                        wp += 1;
2282                        last_squeezed = b as u16;
2283                    }
2284                    i += 1;
2285                    while i < len && *inp.add(i) == b {
2286                        i += 1;
2287                    }
2288                } else {
2289                    last_squeezed = 256;
2290                    *outp.add(wp) = b;
2291                    wp += 1;
2292                    i += 1;
2293                }
2294            }
2295            outbuf.set_len(wp);
2296        }
2297        return writer.write_all(&outbuf);
2298    }
2299
2300    // Chunked path for large data
2301    let buf_size = data.len().min(BUF_SIZE);
2302    let mut outbuf = vec![0u8; buf_size];
2303    let mut last_squeezed: u16 = 256;
2304
2305    for chunk in data.chunks(buf_size) {
2306        let len = chunk.len();
2307        let mut wp = 0;
2308        let mut i = 0;
2309
2310        unsafe {
2311            let inp = chunk.as_ptr();
2312            let outp = outbuf.as_mut_ptr();
2313
2314            while i < len {
2315                let b = *inp.add(i);
2316                if is_member(&member, b) {
2317                    if last_squeezed != b as u16 {
2318                        *outp.add(wp) = b;
2319                        wp += 1;
2320                        last_squeezed = b as u16;
2321                    }
2322                    i += 1;
2323                    while i < len && *inp.add(i) == b {
2324                        i += 1;
2325                    }
2326                } else {
2327                    last_squeezed = 256;
2328                    *outp.add(wp) = b;
2329                    wp += 1;
2330                    i += 1;
2331                }
2332            }
2333        }
2334        writer.write_all(&outbuf[..wp])?;
2335    }
2336    Ok(())
2337}
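
// Sketch of the bitset squeeze path: four squeeze chars bypass the specialized
// 1/2/3-char dispatchers above. Runs of non-members (the trailing "ee") are kept.
#[cfg(test)]
mod squeeze_bitset_sketch {
    use super::*;

    #[test]
    fn collapses_runs_of_member_bytes_only() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b"abcd", b"aabbccddee", &mut out).unwrap();
        assert_eq!(&out[..], &b"abcdee"[..]);
    }
}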
2338
2339/// Squeeze a single chunk using bitset membership. Returns squeezed output.
2340fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
2341    let len = chunk.len();
2342    let mut out = Vec::with_capacity(len);
2343    let mut last_squeezed: u16 = 256;
2344    let mut i = 0;
2345
2346    unsafe {
2347        out.set_len(len);
2348        let inp = chunk.as_ptr();
2349        let outp: *mut u8 = out.as_mut_ptr();
2350        let mut wp = 0;
2351
2352        while i < len {
2353            let b = *inp.add(i);
2354            if is_member(member, b) {
2355                if last_squeezed != b as u16 {
2356                    *outp.add(wp) = b;
2357                    wp += 1;
2358                    last_squeezed = b as u16;
2359                }
2360                i += 1;
2361                while i < len && *inp.add(i) == b {
2362                    i += 1;
2363                }
2364            } else {
2365                last_squeezed = 256;
2366                *outp.add(wp) = b;
2367                wp += 1;
2368                i += 1;
2369            }
2370        }
2371        out.set_len(wp);
2372    }
2373    out
2374}
2375
2376fn squeeze_multi_mmap<const N: usize>(
2377    chars: &[u8],
2378    data: &[u8],
2379    writer: &mut impl Write,
2380) -> io::Result<()> {
2381    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
2382    if data.len() >= PARALLEL_THRESHOLD {
2383        let member = build_member_set(chars);
2384        let n_threads = rayon::current_num_threads().max(1);
2385        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2386
2387        let results: Vec<Vec<u8>> = data
2388            .par_chunks(chunk_size)
2389            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2390            .collect();
2391
2392        // Build IoSlice list, fixing boundaries
2393        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2394        for (idx, result) in results.iter().enumerate() {
2395            if result.is_empty() {
2396                continue;
2397            }
2398            if idx > 0 {
2399                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2400                    if is_member(&member, prev_last) {
2401                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2402                        if skip < result.len() {
2403                            slices.push(std::io::IoSlice::new(&result[skip..]));
2404                        }
2405                        continue;
2406                    }
2407                }
2408            }
2409            slices.push(std::io::IoSlice::new(result));
2410        }
2411        return write_ioslices(writer, &slices);
2412    }
2413
2414    let buf_size = data.len().min(BUF_SIZE);
2415    let mut outbuf = vec![0u8; buf_size];
2416    let mut wp = 0;
2417    let mut last_squeezed: u16 = 256;
2418    let mut cursor = 0;
2419
2420    macro_rules! find_next {
2421        ($data:expr) => {
2422            if N == 2 {
2423                memchr::memchr2(chars[0], chars[1], $data)
2424            } else {
2425                memchr::memchr3(chars[0], chars[1], chars[2], $data)
2426            }
2427        };
2428    }
2429
2430    macro_rules! flush_and_copy {
2431        ($src:expr, $len:expr) => {
2432            if wp + $len > buf_size {
2433                writer.write_all(&outbuf[..wp])?;
2434                wp = 0;
2435            }
2436            if $len > buf_size {
2437                writer.write_all($src)?;
2438            } else {
2439                outbuf[wp..wp + $len].copy_from_slice($src);
2440                wp += $len;
2441            }
2442        };
2443    }
2444
2445    while cursor < data.len() {
2446        match find_next!(&data[cursor..]) {
2447            Some(offset) => {
2448                let pos = cursor + offset;
2449                let b = data[pos];
2450                if pos > cursor {
2451                    let span = pos - cursor;
2452                    flush_and_copy!(&data[cursor..pos], span);
2453                    last_squeezed = 256;
2454                }
2455                if last_squeezed != b as u16 {
2456                    if wp >= buf_size {
2457                        writer.write_all(&outbuf[..wp])?;
2458                        wp = 0;
2459                    }
2460                    outbuf[wp] = b;
2461                    wp += 1;
2462                    last_squeezed = b as u16;
2463                }
2464                let mut skip = pos + 1;
2465                while skip < data.len() && data[skip] == b {
2466                    skip += 1;
2467                }
2468                cursor = skip;
2469            }
2470            None => {
2471                let remaining = data.len() - cursor;
2472                flush_and_copy!(&data[cursor..], remaining);
2473                break;
2474            }
2475        }
2476    }
2477    if wp > 0 {
2478        writer.write_all(&outbuf[..wp])?;
2479    }
2480    Ok(())
2481}
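
// Sketch of the 2-char squeeze dispatch: squeeze_mmap routes b" \n" through
// squeeze_multi_mmap::<2>, which scans with memchr2 and collapses each member's
// runs independently.
#[cfg(test)]
mod squeeze_two_chars_sketch {
    use super::*;

    #[test]
    fn squeezes_each_member_byte_independently() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b" \n", b"a  b\n\n\nc", &mut out).unwrap();
        assert_eq!(&out[..], &b"a b\nc"[..]);
    }
}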
2482
2483fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2484    if data.is_empty() {
2485        return Ok(());
2486    }
2487
2488    if memchr::memmem::find(data, &[ch, ch]).is_none() {
2489        return writer.write_all(data);
2490    }
2491
2492    // Parallel path: squeeze each chunk, fix boundaries
2493    if data.len() >= PARALLEL_THRESHOLD {
2494        let n_threads = rayon::current_num_threads().max(1);
2495        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2496
2497        let results: Vec<Vec<u8>> = data
2498            .par_chunks(chunk_size)
2499            .map(|chunk| {
2500                let mut out = Vec::with_capacity(chunk.len());
2501                let mut cursor = 0;
2502                while cursor < chunk.len() {
2503                    match memchr::memchr(ch, &chunk[cursor..]) {
2504                        Some(offset) => {
2505                            let pos = cursor + offset;
2506                            if pos > cursor {
2507                                out.extend_from_slice(&chunk[cursor..pos]);
2508                            }
2509                            out.push(ch);
2510                            cursor = pos + 1;
2511                            while cursor < chunk.len() && chunk[cursor] == ch {
2512                                cursor += 1;
2513                            }
2514                        }
2515                        None => {
2516                            out.extend_from_slice(&chunk[cursor..]);
2517                            break;
2518                        }
2519                    }
2520                }
2521                out
2522            })
2523            .collect();
2524
2525        // Build the IoSlice list, fixing squeeze runs that span chunk boundaries.
2526        // Use writev to minimize syscalls.
2527        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2528        for (idx, result) in results.iter().enumerate() {
2529            if result.is_empty() {
2530                continue;
2531            }
2532            if idx > 0 {
2533                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2534                    if prev_last == ch {
2535                        // Skip leading ch bytes in this chunk result
2536                        let skip = result.iter().take_while(|&&b| b == ch).count();
2537                        if skip < result.len() {
2538                            slices.push(std::io::IoSlice::new(&result[skip..]));
2539                        }
2540                        continue;
2541                    }
2542                }
2543            }
2544            slices.push(std::io::IoSlice::new(result));
2545        }
2546        return write_ioslices(writer, &slices);
2547    }
2548
2549    let buf_size = data.len().min(BUF_SIZE);
2550    let mut outbuf = vec![0u8; buf_size];
2551    let len = data.len();
2552    let mut wp = 0;
2553    let mut cursor = 0;
2554
2555    while cursor < len {
2556        match memchr::memchr(ch, &data[cursor..]) {
2557            Some(offset) => {
2558                let pos = cursor + offset;
2559                let gap = pos - cursor;
2560                if gap > 0 {
2561                    if wp + gap > buf_size {
2562                        writer.write_all(&outbuf[..wp])?;
2563                        wp = 0;
2564                    }
2565                    if gap > buf_size {
2566                        writer.write_all(&data[cursor..pos])?;
2567                    } else {
2568                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
2569                        wp += gap;
2570                    }
2571                }
2572                if wp >= buf_size {
2573                    writer.write_all(&outbuf[..wp])?;
2574                    wp = 0;
2575                }
2576                outbuf[wp] = ch;
2577                wp += 1;
2578                cursor = pos + 1;
2579                while cursor < len && data[cursor] == ch {
2580                    cursor += 1;
2581                }
2582            }
2583            None => {
2584                let remaining = len - cursor;
2585                if remaining > 0 {
2586                    if wp + remaining > buf_size {
2587                        writer.write_all(&outbuf[..wp])?;
2588                        wp = 0;
2589                    }
2590                    if remaining > buf_size {
2591                        writer.write_all(&data[cursor..])?;
2592                    } else {
2593                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
2594                        wp += remaining;
2595                    }
2596                }
2597                break;
2598            }
2599        }
2600    }
2601
2602    if wp > 0 {
2603        writer.write_all(&outbuf[..wp])?;
2604    }
2605    Ok(())
2606}
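
// Sketch of the single-char squeeze, including the memmem pre-check that passes
// input through untouched when the byte never repeats.
#[cfg(test)]
mod squeeze_single_char_sketch {
    use super::*;

    #[test]
    fn squeezes_only_the_requested_byte() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b"l", b"hellllo  there", &mut out).unwrap();
        assert_eq!(&out[..], &b"helo  there"[..]);
    }

    #[test]
    fn passes_through_when_there_are_no_adjacent_repeats() {
        let mut out: Vec<u8> = Vec::new();
        squeeze_mmap(b"l", b"helo world", &mut out).unwrap();
        assert_eq!(&out[..], &b"helo world"[..]);
    }
}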