Skip to main content

coreutils_rs/tr/
core.rs

1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
/// Maximum IoSlice entries per write_vectored batch.
/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
const MAX_IOV: usize = 1024;

/// Main processing buffer: 4MB (fits in L3 cache, avoids cache thrashing).
const BUF_SIZE: usize = 4 * 1024 * 1024;

/// Stream buffer: 1MB — fits in L2/L3 cache so data stays hot between read,
/// process, and write phases. Larger buffers (4-8MB) cause cache thrashing.
const STREAM_BUF: usize = 1024 * 1024;

/// Translate stream buffer: 4MB — translate is compute-light (single table lookup
/// per byte) so the bottleneck is I/O syscalls, not cache pressure. Larger buffer
/// means fewer read()/write() syscall pairs (3 for 10MB instead of 10).
const TRANSLATE_STREAM_BUF: usize = 4 * 1024 * 1024;

/// Minimum data size to engage rayon parallel processing for mmap paths.
/// Below this, single-threaded is faster due to thread pool overhead.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

/// Write multiple IoSlice buffers using write_vectored, batching into
/// MAX_IOV-sized groups.
///
/// A short vectored write falls back to `write_all` on the unwritten
/// remainder of each slice. `ErrorKind::Interrupted` (EINTR) from the
/// vectored call is retried rather than propagated, matching the retry
/// semantics `write_all` already provides for the fallback path.
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        // Retry the vectored write on EINTR; it is transient and must not
        // abort the whole operation.
        let n = loop {
            match writer.write_vectored(batch) {
                Ok(n) => break n,
                Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(e),
            }
        };
        if n >= total {
            continue;
        }
        // Partial write: skip slices that were fully written, then finish
        // the remainder with write_all (which handles both the partially
        // written slice and all untouched ones).
        let mut written = n;
        for slice in batch {
            let slen = slice.len();
            if written >= slen {
                written -= slen;
                continue;
            }
            writer.write_all(&slice[written..])?;
            written = 0;
        }
    }
    Ok(())
}
57
/// Allocate a `Vec<u8>` of the given length, zero-initialized.
///
/// This previously used `Vec::set_len` over uninitialized memory; that makes
/// the (safe) return value undefined behavior to read before every byte has
/// been overwritten — an unsound contract for a safe function. Zero-filling
/// is sound and effectively free for the large buffers used here: big
/// allocations are served as pre-zeroed pages by the OS, so `vec![0; len]`
/// lowers to `alloc_zeroed` with no explicit memset in the common case.
#[inline]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    vec![0u8; len]
}
70
/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
///
/// Bytes not named in `set1` map to themselves. Following GNU tr semantics,
/// when `set2` is shorter than `set1` its final character is repeated; if
/// `set2` is empty the extra `set1` entries stay identity-mapped.
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    for (i, &from) in set1.iter().enumerate() {
        let to = set2.get(i).or_else(|| set2.last()).copied().unwrap_or(from);
        table[from as usize] = to;
    }
    table
}
85
/// Build a 256-bit (32-byte) membership bitmap for O(1) byte lookup.
///
/// Byte `ch` occupies bit `ch & 7` of byte `ch >> 3`.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    chars.iter().fold([0u8; 32], |mut set, &ch| {
        set[usize::from(ch >> 3)] |= 1 << (ch & 7);
        set
    })
}
95
/// Test membership of `ch` in a 256-bit bitmap built by `build_member_set`.
///
/// Safe indexing is used deliberately: `ch >> 3` is at most 31, so the
/// compiler statically proves the `[u8; 32]` access in-bounds and emits no
/// bounds check — the previous `get_unchecked` bought nothing but `unsafe`.
#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    (set[usize::from(ch >> 3)] & (1 << (ch & 7))) != 0
}
100
101/// Translate bytes in-place using a 256-byte lookup table.
102/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition for
103/// ~32 bytes per iteration. Falls back to 8x-unrolled scalar on other platforms.
104#[inline(always)]
105fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
106    #[cfg(target_arch = "x86_64")]
107    {
108        if is_x86_feature_detected!("avx2") {
109            unsafe { translate_inplace_avx2_table(data, table) };
110            return;
111        }
112        if is_x86_feature_detected!("ssse3") {
113            unsafe { translate_inplace_ssse3_table(data, table) };
114            return;
115        }
116    }
117    translate_inplace_scalar(data, table);
118}
119
/// Scalar fallback: straightforward per-byte table lookup.
///
/// The previous version hand-unrolled 8x with raw pointers and
/// `get_unchecked`, but none of that `unsafe` is needed: indexing a
/// `[u8; 256]` with a `usize` derived from a `u8` can never be out of
/// bounds, so the safe loop below compiles without bounds checks, and the
/// compiler is free to unroll it itself.
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    for b in data.iter_mut() {
        *b = table[*b as usize];
    }
}
144
145// ============================================================================
146// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
147// ============================================================================
148//
149// For an arbitrary 256-byte lookup table, we decompose each byte into
150// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
151// SIMD vectors, one for each high nibble value h (0..15), containing
152// the 16 table entries table[h*16+0..h*16+15]. Then for each input
153// vector we:
154//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
155//   2. Extract high nibble (shift right 4) -> used to select which table
156//   3. For each of the 16 high nibble values, create a mask where
157//      the high nibble equals that value, pshufb the corresponding
158//      table, and accumulate results
159//
160// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
161// With instruction-level parallelism, this achieves much higher throughput
162// than scalar table lookups which have serial data dependencies.
163
/// In-place arbitrary-table translation using AVX2, 32 bytes per iteration.
///
/// Implements the pshufb nibble-decomposition scheme described above: 16
/// pre-broadcast LUT vectors are selected by each byte's high nibble and
/// indexed by its low nibble, then OR-merged into the result.
///
/// # Safety
/// The CPU must support AVX2; callers gate this behind
/// `is_x86_feature_detected!("avx2")`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Pre-build 16 lookup vectors, one per high nibble value.
        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
        // 16 table entries for pshufb indexing by low nibble.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            // SAFETY: base + i <= 15*16 + 15 = 255, always within [u8; 256].
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            // Broadcast the 128-bit row to both lanes of the 256-bit vector
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            // srli_epi16 shifts within 16-bit lanes; the AND with 0x0F
            // discards the bits shifted in from the neighboring byte.
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            // Accumulate result: for each high nibble value h, select bytes where
            // hi_nibble == h, look up in lut[h] using lo_nibble, blend into result.
            let mut result = _mm256_setzero_si256();

            // Unroll the 16 high-nibble iterations for best ILP
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // SSE/SSSE3 tail for remaining 16-byte chunk
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            // Build 128-bit LUTs (just the lower lane of each 256-bit LUT)
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail
        // SAFETY: i < len bounds the pointer accesses; the table index is a
        // u8, always < 256.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
278
/// In-place arbitrary-table translation using SSSE3, 16 bytes per iteration.
///
/// Same pshufb nibble-decomposition scheme as the AVX2 variant, restricted
/// to 128-bit vectors.
///
/// # Safety
/// The CPU must support SSSE3; callers gate this behind
/// `is_x86_feature_detected!("ssse3")`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Pre-build 16 lookup vectors for pshufb
        // SAFETY: base + i <= 15*16 + 15 = 255, always within [u8; 256].
        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            // srli_epi16 shifts within 16-bit lanes; the AND with 0x0F
            // discards the bits shifted in from the neighboring byte.
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            // Unrolled over all 16 high-nibble values for ILP.
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        // SAFETY: i < len bounds the pointer accesses; table index is a u8.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
342
343/// Translate bytes from source to destination using a 256-byte lookup table.
344/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
345#[inline(always)]
346fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
347    debug_assert!(dst.len() >= src.len());
348    #[cfg(target_arch = "x86_64")]
349    {
350        if is_x86_feature_detected!("avx2") {
351            unsafe { translate_to_avx2_table(src, dst, table) };
352            return;
353        }
354        if is_x86_feature_detected!("ssse3") {
355            unsafe { translate_to_ssse3_table(src, dst, table) };
356            return;
357        }
358    }
359    translate_to_scalar(src, dst, table);
360}
361
/// Scalar fallback for translate_to: per-byte table lookup from src to dst.
///
/// The previous version hand-unrolled 8x with raw pointers and
/// `get_unchecked`; none of that `unsafe` is needed. `zip` bounds the loop
/// to `src.len()` (callers guarantee `dst` is at least that long), and a
/// `[u8; 256]` indexed by a `u8`-derived `usize` can never be out of
/// bounds, so the safe loop compiles without bounds checks.
#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    for (d, &s) in dst.iter_mut().zip(src.iter()) {
        *d = table[s as usize];
    }
}
387
/// Out-of-place arbitrary-table translation using AVX2, 32 bytes per
/// iteration, reading from `src` and writing `src.len()` bytes into `dst`.
///
/// Same pshufb nibble-decomposition scheme as the in-place AVX2 variant.
///
/// # Safety
/// The CPU must support AVX2, and `dst` must be at least `src.len()` bytes;
/// callers gate the former behind `is_x86_feature_detected!("avx2")` and
/// the latter with a `debug_assert!` in `translate_to`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // Pre-build 16 lookup vectors
        // SAFETY: base + i <= 15*16 + 15 = 255, always within [u8; 256].
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            // srli_epi16 shifts within 16-bit lanes; AND 0x0F discards the
            // bits shifted in from the neighboring byte.
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            // Unrolled over all 16 high-nibble values for ILP.
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // SSSE3 tail for remaining 16-byte chunk
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail
        // SAFETY: i < len bounds the pointer accesses; table index is a u8.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}
495
/// Out-of-place arbitrary-table translation using SSSE3, 16 bytes per
/// iteration, reading from `src` and writing `src.len()` bytes into `dst`.
///
/// Same pshufb nibble-decomposition scheme as the AVX2 variant, restricted
/// to 128-bit vectors.
///
/// # Safety
/// The CPU must support SSSE3, and `dst` must be at least `src.len()`
/// bytes; callers gate the former behind
/// `is_x86_feature_detected!("ssse3")`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // SAFETY: base + i <= 15*16 + 15 = 255, always within [u8; 256].
        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            // srli_epi16 shifts within 16-bit lanes; AND 0x0F discards the
            // bits shifted in from the neighboring byte.
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            // Unrolled over all 16 high-nibble values for ILP.
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        // SAFETY: i < len bounds the pointer accesses; table index is a u8.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}
559
560// ============================================================================
561// SIMD range translation (x86_64)
562// ============================================================================
563
/// Detect if the translate table is a single contiguous range with constant offset.
/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
/// table[i] = i + offset for all i in [lo, hi].
///
/// The offset is computed in i16 so distinct wrap-arounds are not conflated,
/// then truncated to i8 for the byte-wise (mod 256) arithmetic used by the
/// SIMD range kernels.
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    // (lo, hi, offset) of the contiguous non-identity run seen so far.
    let mut run: Option<(u8, u8, i16)> = None;
    for (i, &mapped) in table.iter().enumerate() {
        if mapped == i as u8 {
            continue; // identity entry — not part of any run
        }
        let diff = mapped as i16 - i as i16;
        run = match run {
            None => Some((i as u8, i as u8, diff)),
            // Extend the run only if the offset matches and the index is
            // exactly adjacent to the previous non-identity entry.
            Some((lo, hi, off)) if diff == off && i as u8 == hi.wrapping_add(1) => {
                Some((lo, i as u8, off))
            }
            Some(_) => return None,
        };
    }
    run.map(|(lo, hi, off)| (lo, hi, off as i8))
}
594
595/// SIMD-accelerated range translation for mmap'd data.
596/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
597/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
598#[cfg(target_arch = "x86_64")]
599fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
600    if is_x86_feature_detected!("avx2") {
601        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
602    } else {
603        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
604    }
605}
606
/// Range translation using AVX2: bytes in `[lo..=hi]` get `offset` added
/// (mod 256); all other bytes are copied unchanged from `src` to `dst`.
///
/// # Safety
/// The CPU must support AVX2, and `dst` must be at least `src.len()` bytes.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Bias: shift range so lo maps to -128 (signed min).
        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
        // For input < lo: biased wraps to large positive (signed), > threshold.
        // For input > hi: biased > -128+range, > threshold.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt = 0xFF where biased > threshold (OUT of range)
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // mask = 0xFF where IN range (NOT gt)
            let mask = _mm256_cmpeq_epi8(gt, zero);
            // Add the offset only to in-range lanes; wrapping add matches
            // the scalar byte arithmetic.
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        // SSE2 tail for 16-byte remainder
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        // SAFETY: i < len == src.len() and dst is at least as long.
        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
668
/// Range translation using SSE2 (x86_64 baseline): bytes in `[lo..=hi]` get
/// `offset` added (mod 256); other bytes are copied unchanged. Uses the same
/// bias + signed-compare trick documented on the AVX2 variant.
///
/// # Safety
/// `dst` must be at least `src.len()` bytes. SSE2 itself is guaranteed on
/// every x86_64 CPU.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Bias lo to signed min; in-range values land in [-128, -128+range].
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            // gt marks OUT-of-range lanes; mask the offset to in-range lanes only.
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        // SAFETY: i < len == src.len() and dst is at least as long.
        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
706
/// Scalar range translation fallback for non-x86_64 targets: bytes in
/// `[lo..=hi]` get `offset` added (mod 256), others are copied unchanged.
#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (d, &b) in dst.iter_mut().zip(src.iter()) {
        *d = if (lo..=hi).contains(&b) {
            b.wrapping_add(offset as u8)
        } else {
            b
        };
    }
}
718
719// ============================================================================
720// In-place SIMD range translation (saves one buffer allocation in streaming)
721// ============================================================================
722
723/// In-place SIMD-accelerated range translation.
724/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
725/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
726#[cfg(target_arch = "x86_64")]
727fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
728    if is_x86_feature_detected!("avx2") {
729        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
730    } else {
731        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
732    }
733}
734
/// In-place AVX2 range translation: bytes in `[lo..=hi]` get `offset` added
/// (mod 256) directly in `data`. Uses the same bias + signed-compare trick
/// documented on `translate_range_avx2`.
///
/// # Safety
/// The CPU must support AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Bias lo to signed min; in-range values land in [-128, -128+range].
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt marks OUT-of-range lanes; mask the offset to in-range lanes.
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // SSE2 tail for a remaining 16-byte chunk.
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        // SAFETY: i < len bounds the pointer accesses.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
789
/// In-place SSE2 range translation: bytes in `[lo..=hi]` get `offset` added
/// (mod 256) directly in `data`. Same bias + signed-compare trick as the
/// AVX2 variant.
///
/// # Safety
/// SSE2 is part of the x86_64 baseline, so this is callable on any x86_64
/// CPU; pointer accesses are bounded by `data.len()`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Bias lo to signed min; in-range values land in [-128, -128+range].
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            // gt marks OUT-of-range lanes; mask the offset to in-range lanes.
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        // SAFETY: i < len bounds the pointer accesses.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
828
/// Scalar fallback for non-x86_64 targets: adds `offset` to every byte that
/// falls inside `[lo..=hi]`, in place; all other bytes are left as-is.
#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for byte in data.iter_mut() {
        if (lo..=hi).contains(byte) {
            *byte = byte.wrapping_add(offset as u8);
        }
    }
}
837
838// ============================================================================
839// SIMD range deletion (x86_64)
840// ============================================================================
841
842/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
843/// Returns Some((lo, hi)) if so. This is true for common classes:
844/// - `[:digit:]` = 0x30..=0x39
845/// - `a-z` = 0x61..=0x7A
846/// - `A-Z` = 0x41..=0x5A
847#[inline]
848fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
849    if chars.is_empty() {
850        return None;
851    }
852    let mut lo = chars[0];
853    let mut hi = chars[0];
854    for &c in &chars[1..] {
855        if c < lo {
856            lo = c;
857        }
858        if c > hi {
859            hi = c;
860        }
861    }
862    // Check that the range size matches the number of chars (no gaps)
863    if (hi - lo + 1) as usize == chars.len() {
864        Some((lo, hi))
865    } else {
866        None
867    }
868}
869
870/// SIMD-accelerated delete for contiguous byte ranges.
871/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
872/// then compacts output by skipping matched bytes.
873#[cfg(target_arch = "x86_64")]
874fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
875    if is_x86_feature_detected!("avx2") {
876        unsafe { delete_range_avx2(src, dst, lo, hi) }
877    } else {
878        unsafe { delete_range_sse2(src, dst, lo, hi) }
879    }
880}
881
/// AVX2 range delete: copies every byte of `src` outside [lo..=hi] into `dst`
/// (32 bytes classified per iteration), returning the count written.
///
/// `dst` must be at least `src.len()` bytes — worst case nothing is deleted.
///
/// # Safety
/// Caller must ensure AVX2 is available and `lo <= hi` (`hi - lo` would
/// underflow otherwise).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        // Bias trick: shift [lo..=hi] to the bottom of the signed i8 range so
        // one signed compare classifies each lane.
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0; // read index into src
        let mut wp = 0; // write index into dst (wp <= ri always)

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            // keep_mask bits: 1 = keep (NOT in range)
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            // Compact: copy kept bytes using the mask
            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1; // clear lowest set bit
            }
            ri += 32;
        }

        // SSE2 tail for 16-byte remainder
        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            // Only the low 16 bits are meaningful for a 128-bit movemask.
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1;
            }
            ri += 16;
        }

        // Scalar tail
        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}
955
/// SSE2 range delete: copies every byte of `src` outside [lo..=hi] into `dst`
/// (16 bytes classified per iteration), returning the count written.
///
/// `dst` must be at least `src.len()` bytes — worst case nothing is deleted.
///
/// # Safety
/// Caller must ensure SSE2 is available (always true on x86_64) and `lo <= hi`
/// (`hi - lo` would underflow otherwise).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        // Same bias/threshold classification as the AVX2 path, 16 lanes wide.
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0; // read index into src
        let mut wp = 0; // write index into dst (wp <= ri always)

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            // 1-bits mark bytes to KEEP; mask the low 16 valid lanes.
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1;
            }
            ri += 16;
        }

        // Scalar tail for the final <16 bytes.
        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}
1002
/// Scalar range delete fallback for non-x86_64: copies every byte of `src`
/// outside [lo..=hi] into `dst`, returning the number of bytes written.
#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let mut written = 0;
    for &byte in src.iter() {
        if !(lo..=hi).contains(&byte) {
            dst[written] = byte;
            written += 1;
        }
    }
    written
}
1015
1016/// Streaming delete for contiguous byte ranges using SIMD range detection.
1017fn delete_range_streaming(
1018    lo: u8,
1019    hi: u8,
1020    reader: &mut impl Read,
1021    writer: &mut impl Write,
1022) -> io::Result<()> {
1023    let mut src = vec![0u8; STREAM_BUF];
1024    let mut dst = alloc_uninit_vec(STREAM_BUF);
1025    loop {
1026        let n = read_full(reader, &mut src)?;
1027        if n == 0 {
1028            break;
1029        }
1030        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
1031        if wp > 0 {
1032            writer.write_all(&dst[..wp])?;
1033        }
1034    }
1035    Ok(())
1036}
1037
1038// ============================================================================
1039// Streaming functions (Read + Write)
1040// ============================================================================
1041
1042pub fn translate(
1043    set1: &[u8],
1044    set2: &[u8],
1045    reader: &mut impl Read,
1046    writer: &mut impl Write,
1047) -> io::Result<()> {
1048    let table = build_translate_table(set1, set2);
1049
1050    // Check for identity table — pure passthrough (no transformation needed)
1051    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1052    if is_identity {
1053        return passthrough_stream(reader, writer);
1054    }
1055
1056    // Try SIMD fast path for range translations (in-place, single buffer)
1057    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1058        return translate_range_stream(lo, hi, offset, reader, writer);
1059    }
1060
1061    // General case: IN-PLACE translation on a SINGLE 4MB buffer.
1062    // This halves memory bandwidth vs the old separate src/dst approach:
1063    // - Old: read into src (4MB), translate from src→dst (read 4MB + write 4MB), write dst (4MB) = 12MB
1064    // - New: read into buf (4MB), translate in-place (read+write 4MB), write buf (4MB) = 8MB
1065    // The 8x-unrolled in-place translate avoids store-to-load forwarding stalls
1066    // because consecutive reads are 8 bytes apart (sequential), not aliased.
1067    // Using 4MB buffer (vs 1MB) reduces syscall count from 10 to 3 for 10MB.
1068    let mut buf = vec![0u8; TRANSLATE_STREAM_BUF];
1069    loop {
1070        let n = read_full(reader, &mut buf)?;
1071        if n == 0 {
1072            break;
1073        }
1074        translate_inplace(&mut buf[..n], &table);
1075        writer.write_all(&buf[..n])?;
1076    }
1077    Ok(())
1078}
1079
1080/// Streaming SIMD range translation — single buffer, in-place transform.
1081/// Uses 4MB buffer for fewer syscalls (translate is compute-light).
1082fn translate_range_stream(
1083    lo: u8,
1084    hi: u8,
1085    offset: i8,
1086    reader: &mut impl Read,
1087    writer: &mut impl Write,
1088) -> io::Result<()> {
1089    let mut buf = vec![0u8; TRANSLATE_STREAM_BUF];
1090    loop {
1091        let n = read_full(reader, &mut buf)?;
1092        if n == 0 {
1093            break;
1094        }
1095        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
1096        writer.write_all(&buf[..n])?;
1097    }
1098    Ok(())
1099}
1100
1101/// Pure passthrough: copy stdin to stdout without transformation.
1102/// Uses a single 4MB buffer with direct read/write, no processing overhead.
1103fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1104    let mut buf = vec![0u8; TRANSLATE_STREAM_BUF];
1105    loop {
1106        let n = read_full(reader, &mut buf)?;
1107        if n == 0 {
1108            break;
1109        }
1110        writer.write_all(&buf[..n])?;
1111    }
1112    Ok(())
1113}
1114
/// Fill `buf` as far as possible, looping over short reads.
/// Returns the number of bytes read; a result smaller than `buf.len()`
/// means EOF was reached. `Interrupted` errors are retried transparently.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut filled = 0;
    loop {
        if filled == buf.len() {
            break;
        }
        match reader.read(&mut buf[filled..]) {
            Ok(0) => break, // EOF
            Ok(n) => filled += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => {}
            Err(e) => return Err(e),
        }
    }
    Ok(filled)
}
1129
/// Streaming translate (set1→set2) followed by squeeze of repeats of bytes
/// in set2, matching `tr -s set1 set2` semantics.
///
/// `last_squeezed` is a u16 so the sentinel 256 can mean "no squeezable byte
/// pending"; it persists across buffer refills so runs spanning a buffer
/// boundary are still squeezed to one byte.
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // Two-pass optimization for range translations:
    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
    // Even though it's two passes, the translate pass is so much faster with SIMD
    // that the total is still a net win.
    let range_info = detect_range_offset(&table);

    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        // Pass 1: translate
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        } else {
            translate_inplace(&mut buf[..n], &table);
        }
        // Pass 2: squeeze in-place
        // SAFETY: wp <= i at all times (squeeze only removes bytes), so each
        // write through ptr.add(wp) lands inside the initialized prefix of buf.
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    // Drop the byte if it repeats the previous squeezable byte.
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    // Non-squeezable byte resets the run tracker.
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1182
/// Streaming delete of every byte in `delete_chars` from the input.
///
/// Strategy dispatch, cheapest first:
/// - 1 char: memchr-based run copying;
/// - 2-3 chars: memchr2/memchr3-based run copying;
/// - contiguous range (e.g. `[:digit:]`, a-z): SIMD range compaction;
/// - general case: 256-bit membership set with an 8x-unrolled scalar
///   compaction loop (the unroll hides the bitset-lookup latency).
pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
    // use vectorized range comparison instead of scalar bitset lookup.
    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_streaming(lo, hi, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        // SAFETY: wp <= i at all times (deletion only shrinks), so writes via
        // ptr.add(wp) stay within the initialized prefix of buf; all reads are
        // at indices < n <= buf.len().
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            while i + 8 <= n {
                // Load 8 bytes up front so the membership tests can overlap.
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                if !is_member(&member, b0) {
                    *ptr.add(wp) = b0;
                    wp += 1;
                }
                if !is_member(&member, b1) {
                    *ptr.add(wp) = b1;
                    wp += 1;
                }
                if !is_member(&member, b2) {
                    *ptr.add(wp) = b2;
                    wp += 1;
                }
                if !is_member(&member, b3) {
                    *ptr.add(wp) = b3;
                    wp += 1;
                }
                if !is_member(&member, b4) {
                    *ptr.add(wp) = b4;
                    wp += 1;
                }
                if !is_member(&member, b5) {
                    *ptr.add(wp) = b5;
                    wp += 1;
                }
                if !is_member(&member, b6) {
                    *ptr.add(wp) = b6;
                    wp += 1;
                }
                if !is_member(&member, b7) {
                    *ptr.add(wp) = b7;
                    wp += 1;
                }
                i += 8;
            }
            // Scalar tail for the final <8 bytes.
            while i < n {
                let b = *ptr.add(i);
                if !is_member(&member, b) {
                    *ptr.add(wp) = b;
                    wp += 1;
                }
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1271
/// Streaming delete of a single byte value using memchr to find occurrences
/// and copy the gap between them in bulk.
///
/// Uses a plain `read()` (not `read_full`) since partial buffers are fine
/// here — there is no cross-buffer state.
fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // wp = write position; i = read position. wp <= i always, since
        // deletion only shrinks, so the overlapping copies below are safe
        // (std::ptr::copy is memmove and handles the overlap).
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    // Bulk-copy the run of kept bytes before the match,
                    // skipping the copy entirely if nothing was deleted yet.
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    // Skip the matched (deleted) byte.
                    i += offset + 1;
                }
                None => {
                    // No more matches: keep the rest of the buffer.
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1326
/// Streaming delete of 2 or 3 byte values using memchr2/memchr3.
/// Same run-copy structure as `delete_single_streaming`; callers guarantee
/// `chars.len()` is 2 or 3.
fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // wp <= i always (deletion only shrinks), making the in-buffer
        // copies below safe; std::ptr::copy is memmove-safe for overlap.
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &buf[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
            };
            match found {
                Some(offset) => {
                    // Bulk-copy the kept run preceding the match.
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    // Skip the matched (deleted) byte.
                    i += offset + 1;
                }
                None => {
                    // No more matches: keep the remainder of the buffer.
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1386
/// Streaming combined delete + squeeze (`tr -ds`): bytes in `delete_chars`
/// are removed, then consecutive repeats of bytes in `squeeze_chars` are
/// collapsed to one occurrence.
///
/// `last_squeezed` uses 256 as an out-of-band "none" sentinel and persists
/// across buffer refills, so runs spanning a refill are still squeezed.
pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        // SAFETY: wp <= i at all times (both delete and squeeze only remove
        // bytes), so writes via ptr.add(wp) stay in the initialized prefix.
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                // Deletion wins first; deleted bytes do not affect squeezing.
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1427
/// Streaming squeeze (`tr -s`): collapse consecutive repeats of any byte in
/// `squeeze_chars` down to a single occurrence.
///
/// Single-char sets get a memchr-based fast path. `last_squeezed` (256 =
/// "none") persists across buffer refills so runs spanning a refill are
/// still squeezed correctly.
pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        // SAFETY: wp <= i always (squeeze only removes bytes), so each write
        // via ptr.add(wp) lands inside the initialized prefix of buf.
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    // Drop repeats of the previous squeezable byte.
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1467
/// Squeeze a single byte value using memchr: copy non-match runs in bulk,
/// emit one `ch` per run of matches, and skip the rest of the run.
///
/// `was_squeeze_char` carries the "last emitted byte was `ch`" state across
/// buffer refills so a run split by a refill still collapses to one byte.
fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };

        // wp = write position, i = read position; wp <= i holds throughout,
        // so the in-buffer copies below are safe (ptr::copy handles overlap).
        let mut wp = 0;
        let mut i = 0;

        while i < n {
            // A run of `ch` continuing from the previous buffer: swallow it.
            if was_squeeze_char && buf[i] == ch {
                i += 1;
                while i < n && buf[i] == ch {
                    i += 1;
                }
                if i >= n {
                    break;
                }
            }

            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    // Bulk-copy the non-match run before the first `ch`.
                    let run_len = offset;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    i += run_len;

                    // Emit exactly one `ch` for the whole run...
                    unsafe {
                        *buf.as_mut_ptr().add(wp) = ch;
                    }
                    wp += 1;
                    was_squeeze_char = true;
                    i += 1;
                    // ...and skip any immediately following repeats.
                    while i < n && buf[i] == ch {
                        i += 1;
                    }
                }
                None => {
                    // No more `ch` in this buffer: keep the remainder as-is.
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    was_squeeze_char = false;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1549
1550// ============================================================================
1551// Mmap-based functions (zero-copy input from byte slice)
1552// ============================================================================
1553
/// Maximum data size for single-allocation translate approach.
/// Below this limit, translate ALL data into one buffer and do a single write_all.
/// Above this, use chunked approach to limit memory usage.
/// NOTE(review): since PARALLEL_THRESHOLD (2MB) < this limit, data larger than
/// 16MB takes the parallel path first, making the chunked fallbacks below
/// effectively unreachable — confirm before relying on or removing them.
const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;
1558
1559/// Translate bytes from an mmap'd byte slice.
1560/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
1561/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
1562/// Falls back to scalar 256-byte table lookup for general translations.
1563///
1564/// For data >= 2MB: uses rayon parallel processing across multiple cores.
1565/// For data <= 16MB: single allocation + single write_all (1 syscall).
1566/// For data > 16MB: chunked approach to limit memory (N syscalls where N = data/4MB).
1567pub fn translate_mmap(
1568    set1: &[u8],
1569    set2: &[u8],
1570    data: &[u8],
1571    writer: &mut impl Write,
1572) -> io::Result<()> {
1573    let table = build_translate_table(set1, set2);
1574
1575    // Check if table is identity — pure passthrough
1576    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1577    if is_identity {
1578        return writer.write_all(data);
1579    }
1580
1581    // Try SIMD fast path for single-range constant-offset translations
1582    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1583        return translate_mmap_range(data, writer, lo, hi, offset);
1584    }
1585
1586    // General case: table lookup (with parallel processing for large data)
1587    translate_mmap_table(data, writer, &table)
1588}
1589
/// SIMD range translate for mmap data, with rayon parallel processing.
///
/// The uninitialized output buffer is safe: `data` and `buf` have identical
/// lengths and are split with the same `chunk_size`, so the zipped chunk
/// pairs align one-to-one and every output byte is written before the final
/// `write_all` reads it.
fn translate_mmap_range(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    offset: i8,
) -> io::Result<()> {
    // Parallel path: split data into chunks, translate each in parallel
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        // At least 32KB per task so scheduling overhead stays negligible.
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Process chunks in parallel: each thread writes to its slice of buf
        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
            });

        return writer.write_all(&buf);
    }

    // Small data: single-threaded SIMD
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_range_simd(data, &mut buf, lo, hi, offset);
        return writer.write_all(&buf);
    }
    // Chunked path for large data (shouldn't happen since PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT)
    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}
1628
/// General table-lookup translate for mmap data, with rayon parallel processing.
///
/// Mirrors `translate_mmap_range`: identical src/dst lengths plus a shared
/// `chunk_size` keep the zipped chunk pairs aligned, so the uninitialized
/// output buffer is fully written before `write_all` reads it.
fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
    // Parallel path: split data into chunks, translate each in parallel
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        // At least 32KB per task so scheduling overhead stays negligible.
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
            });

        return writer.write_all(&buf);
    }

    // Small data: single-threaded
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_to(data, &mut buf, table);
        return writer.write_all(&buf);
    }
    // Chunked fallback to bound memory (see note on SINGLE_WRITE_LIMIT).
    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_to(chunk, &mut buf[..chunk.len()], table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}
1659
1660/// Translate + squeeze from mmap'd byte slice.
1661///
1662/// For data >= 2MB: two-phase approach: parallel translate, then sequential squeeze.
1663/// For data <= 16MB: single-pass translate+squeeze into one buffer, one write syscall.
1664/// For data > 16MB: chunked approach to limit memory.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    // Squeeze membership is tested against SET2, i.e. the post-translate values.
    let squeeze_set = build_member_set(set2);

    // For large data: two-phase approach
    // Phase 1: parallel translate into buffer
    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
    //          (squeeze only removes bytes, never grows, so no second allocation needed)
    if data.len() >= PARALLEL_THRESHOLD {
        // Phase 1: parallel translate
        let mut translated = alloc_uninit_vec(data.len());
        // If the table is a pure "[lo,hi] shifted by a constant offset" map,
        // the SIMD range kernel applies.
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        // 32KB floor keeps per-chunk scheduling overhead negligible.
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            // src and dst par_chunks use the same chunk_size, so zipped pairs
            // line up one-to-one; only the final chunk can be short.
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        // Phase 2: squeeze in-place on the translated buffer.
        // Since squeeze only removes bytes (never grows), we can read ahead and
        // compact into the same buffer, saving a full data.len() heap allocation.
        // 256 is outside u8 range, so the first byte never matches "previous".
        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        // SAFETY: the write cursor `wp` advances at most once per input byte,
        // so wp <= i < len at every raw access: reads and writes stay in
        // bounds and compaction never clobbers unread input.
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        // Repeat of the previous squeezable byte: drop it.
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    // Single-write fast path: translate+squeeze all data in one pass, one write.
    // NOTE(review): with PARALLEL_THRESHOLD (2MB) below SINGLE_WRITE_LIMIT,
    // everything smaller than the parallel threshold lands here, leaving the
    // chunked path below reachable only if those constants are re-tuned.
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        // SAFETY: buf has capacity data.len(); at most one output byte is
        // written per input byte, so wp <= data.len() and the final
        // set_len(wp) covers only initialized bytes.
        unsafe {
            buf.set_len(data.len());
            let outp: *mut u8 = buf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut wp = 0;
            let mut i = 0;
            while i < len {
                // Translate first; squeeze logic operates on the translated value.
                let translated = *table.get_unchecked(*inp.add(i) as usize);
                if is_member(&squeeze_set, translated) {
                    if last_squeezed == translated as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = translated as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(wp) = translated;
                wp += 1;
                i += 1;
            }
            buf.set_len(wp);
        }
        return writer.write_all(&buf);
    }

    // Chunked path for large data. `last_squeezed` persists across chunks so
    // a run spanning a chunk boundary still squeezes to one byte.
    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        // SAFETY: wp <= i < chunk.len() <= buf.len(), so every raw access is
        // in-bounds; squeeze compacts the translated bytes in place.
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1792
/// Delete from mmap'd byte slice.
///
/// Dispatch order: 1 char -> memchr specialization; 2-3 chars ->
/// memchr2/memchr3 specialization; contiguous byte range (digits, a-z, ...)
/// -> SIMD range path; anything else -> 256-bit membership bitset below.
///
/// For data >= 2MB: uses rayon parallel processing across multiple cores.
/// For data <= 16MB: delete into one buffer, one write syscall.
/// For data > 16MB: chunked approach to limit memory.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_mmap(data, writer, lo, hi);
    }

    let member = build_member_set(delete_chars);

    // Parallel path: pre-allocate a single output buffer of data.len() and have each
    // thread write to its non-overlapping slice, then do a single write_all.
    // This avoids per-chunk Vec allocations that the old approach had.
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        // 32KB floor keeps per-chunk scheduling overhead negligible.
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Each thread deletes into its slice of outbuf and returns bytes written.
        // src and dst par_chunks use the same chunk_size, so pairs line up 1:1.
        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        // Compact: move each chunk's output to be contiguous.
        // chunk_lens[i] is how many bytes thread i wrote into its slice.
        // We need to shift them together since each dst_chunk started at chunk_size offsets.
        let mut write_pos = 0;
        let mut src_offset = 0;
        for &clen in &chunk_lens {
            if clen > 0 && src_offset != write_pos {
                // SAFETY: both ranges lie within outbuf (write_pos <= src_offset,
                // src_offset + clen <= outbuf.len()); ptr::copy is memmove-style,
                // so the always-leftward overlapping shift is sound.
                unsafe {
                    std::ptr::copy(
                        outbuf.as_ptr().add(src_offset),
                        outbuf.as_mut_ptr().add(write_pos),
                        clen,
                    );
                }
            }
            write_pos += clen;
            src_offset += chunk_size;
        }

        return writer.write_all(&outbuf[..write_pos]);
    }

    // Single-write fast path: delete into one buffer, one write
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
        return writer.write_all(&outbuf[..out_pos]);
    }

    // Chunked path for large data.
    // NOTE(review): with the current constants (PARALLEL_THRESHOLD < SINGLE_WRITE_LIMIT)
    // this branch is unreachable; kept defensively in case they are re-tuned.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = alloc_uninit_vec(buf_size);

    for chunk in data.chunks(buf_size) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
1867
1868/// SIMD range delete for mmap data, with rayon parallel processing.
1869fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
1870    // Parallel path: each thread deletes from its chunk into a local Vec
1871    if data.len() >= PARALLEL_THRESHOLD {
1872        let n_threads = rayon::current_num_threads().max(1);
1873        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1874
1875        let results: Vec<Vec<u8>> = data
1876            .par_chunks(chunk_size)
1877            .map(|chunk| {
1878                let mut out = alloc_uninit_vec(chunk.len());
1879                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
1880                unsafe { out.set_len(wp) };
1881                out
1882            })
1883            .collect();
1884
1885        let slices: Vec<std::io::IoSlice> = results
1886            .iter()
1887            .filter(|r| !r.is_empty())
1888            .map(|r| std::io::IoSlice::new(r))
1889            .collect();
1890        return write_ioslices(writer, &slices);
1891    }
1892
1893    // Single-write fast path
1894    if data.len() <= SINGLE_WRITE_LIMIT {
1895        let mut outbuf = alloc_uninit_vec(data.len());
1896        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
1897        return writer.write_all(&outbuf[..wp]);
1898    }
1899
1900    // Chunked path
1901    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
1902    for chunk in data.chunks(BUF_SIZE) {
1903        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
1904        writer.write_all(&outbuf[..wp])?;
1905    }
1906    Ok(())
1907}
1908
1909/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
1910/// Returns number of bytes written.
1911#[inline]
1912fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
1913    let len = chunk.len();
1914    let mut out_pos = 0;
1915    let mut i = 0;
1916
1917    while i + 8 <= len {
1918        unsafe {
1919            let b0 = *chunk.get_unchecked(i);
1920            let b1 = *chunk.get_unchecked(i + 1);
1921            let b2 = *chunk.get_unchecked(i + 2);
1922            let b3 = *chunk.get_unchecked(i + 3);
1923            let b4 = *chunk.get_unchecked(i + 4);
1924            let b5 = *chunk.get_unchecked(i + 5);
1925            let b6 = *chunk.get_unchecked(i + 6);
1926            let b7 = *chunk.get_unchecked(i + 7);
1927
1928            *outbuf.get_unchecked_mut(out_pos) = b0;
1929            out_pos += !is_member(member, b0) as usize;
1930            *outbuf.get_unchecked_mut(out_pos) = b1;
1931            out_pos += !is_member(member, b1) as usize;
1932            *outbuf.get_unchecked_mut(out_pos) = b2;
1933            out_pos += !is_member(member, b2) as usize;
1934            *outbuf.get_unchecked_mut(out_pos) = b3;
1935            out_pos += !is_member(member, b3) as usize;
1936            *outbuf.get_unchecked_mut(out_pos) = b4;
1937            out_pos += !is_member(member, b4) as usize;
1938            *outbuf.get_unchecked_mut(out_pos) = b5;
1939            out_pos += !is_member(member, b5) as usize;
1940            *outbuf.get_unchecked_mut(out_pos) = b6;
1941            out_pos += !is_member(member, b6) as usize;
1942            *outbuf.get_unchecked_mut(out_pos) = b7;
1943            out_pos += !is_member(member, b7) as usize;
1944        }
1945        i += 8;
1946    }
1947
1948    while i < len {
1949        unsafe {
1950            let b = *chunk.get_unchecked(i);
1951            *outbuf.get_unchecked_mut(out_pos) = b;
1952            out_pos += !is_member(member, b) as usize;
1953        }
1954        i += 1;
1955    }
1956
1957    out_pos
1958}
1959
1960fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1961    // Parallel path for large data: each thread deletes from its chunk,
1962    // then use writev to write all results in one syscall batch.
1963    if data.len() >= PARALLEL_THRESHOLD {
1964        let n_threads = rayon::current_num_threads().max(1);
1965        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1966
1967        let results: Vec<Vec<u8>> = data
1968            .par_chunks(chunk_size)
1969            .map(|chunk| {
1970                let mut out = Vec::with_capacity(chunk.len());
1971                let mut last = 0;
1972                for pos in memchr::memchr_iter(ch, chunk) {
1973                    if pos > last {
1974                        out.extend_from_slice(&chunk[last..pos]);
1975                    }
1976                    last = pos + 1;
1977                }
1978                if last < chunk.len() {
1979                    out.extend_from_slice(&chunk[last..]);
1980                }
1981                out
1982            })
1983            .collect();
1984
1985        // Use writev to batch all results into fewer syscalls
1986        let slices: Vec<std::io::IoSlice> = results
1987            .iter()
1988            .filter(|r| !r.is_empty())
1989            .map(|r| std::io::IoSlice::new(r))
1990            .collect();
1991        return write_ioslices(writer, &slices);
1992    }
1993
1994    // Single-write fast path: collect all non-deleted spans into one buffer
1995    if data.len() <= SINGLE_WRITE_LIMIT {
1996        let mut outbuf = Vec::with_capacity(data.len());
1997        let mut last = 0;
1998        for pos in memchr::memchr_iter(ch, data) {
1999            if pos > last {
2000                outbuf.extend_from_slice(&data[last..pos]);
2001            }
2002            last = pos + 1;
2003        }
2004        if last < data.len() {
2005            outbuf.extend_from_slice(&data[last..]);
2006        }
2007        return writer.write_all(&outbuf);
2008    }
2009
2010    // Chunked path for large data
2011    let buf_size = data.len().min(BUF_SIZE);
2012    let mut outbuf = vec![0u8; buf_size];
2013
2014    for chunk in data.chunks(buf_size) {
2015        let mut wp = 0;
2016        let mut last = 0;
2017        for pos in memchr::memchr_iter(ch, chunk) {
2018            if pos > last {
2019                let run = pos - last;
2020                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
2021                wp += run;
2022            }
2023            last = pos + 1;
2024        }
2025        if last < chunk.len() {
2026            let run = chunk.len() - last;
2027            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
2028            wp += run;
2029        }
2030        writer.write_all(&outbuf[..wp])?;
2031    }
2032    Ok(())
2033}
2034
/// Delete 2-3 distinct byte values using memchr2/memchr3 span copying.
///
/// As dispatched from `delete_mmap`, `chars.len()` is 2 or 3; the `else { 0 }`
/// fallbacks below are unused padding for the 2-char case (c2 is only read
/// when `is_three` is true).
fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    // Parallel path for large data
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        // 32KB floor keeps per-chunk scheduling overhead negligible.
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                // Copy the spans between deleted bytes into a per-thread Vec.
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                if is_three {
                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
                        if pos > last {
                            out.extend_from_slice(&chunk[last..pos]);
                        }
                        last = pos + 1;
                    }
                } else {
                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
                        if pos > last {
                            out.extend_from_slice(&chunk[last..pos]);
                        }
                        last = pos + 1;
                    }
                }
                if last < chunk.len() {
                    // Tail after the final match.
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();

        // Use writev to batch all results into fewer syscalls
        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    // Single-write fast path: collect all non-deleted spans into one buffer
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = Vec::with_capacity(data.len());
        let mut last = 0;
        if is_three {
            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
                if pos > last {
                    outbuf.extend_from_slice(&data[last..pos]);
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr2_iter(c0, c1, data) {
                if pos > last {
                    outbuf.extend_from_slice(&data[last..pos]);
                }
                last = pos + 1;
            }
        }
        if last < data.len() {
            // Tail after the final match.
            outbuf.extend_from_slice(&data[last..]);
        }
        return writer.write_all(&outbuf);
    }

    // Chunked path for large data
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;

        // Iterate directly over memchr iterator without collecting into Vec<usize>.
        // Positions are used exactly once in order, so no intermediate allocation needed.
        if is_three {
            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
                if pos > last {
                    let run = pos - last;
                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                    wp += run;
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr2_iter(c0, c1, chunk) {
                if pos > last {
                    let run = pos - last;
                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                    wp += run;
                }
                last = pos + 1;
            }
        }

        if last < chunk.len() {
            // Tail after the final match.
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
2146
/// Delete + squeeze from mmap'd byte slice.
///
/// Deletion happens first; deleted bytes do not touch the squeeze state, so a
/// run interrupted only by deleted bytes still collapses to one occurrence.
///
/// For data <= 16MB: delete+squeeze into one buffer, one write syscall.
/// For data > 16MB: chunked approach to limit memory.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    // Single-write fast path: delete+squeeze all data in one pass, one write
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        // 256 is outside u8 range, so the first kept byte never matches "previous".
        let mut last_squeezed: u16 = 256;
        // SAFETY: outbuf has capacity data.len(); at most one output byte per
        // input byte, so out_pos <= data.len() and the final set_len(out_pos)
        // covers only initialized bytes.
        unsafe {
            outbuf.set_len(data.len());
            let outp: *mut u8 = outbuf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut out_pos = 0;
            let mut i = 0;
            while i < len {
                let b = *inp.add(i);
                if is_member(&delete_set, b) {
                    // Dropped without updating squeeze state (see doc comment).
                    i += 1;
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        // Repeat of the previous squeezable byte: drop it.
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(out_pos) = b;
                out_pos += 1;
                i += 1;
            }
            outbuf.set_len(out_pos);
        }
        return writer.write_all(&outbuf);
    }

    // Chunked path for large data. `last_squeezed` persists across chunks so
    // a run spanning a chunk boundary still squeezes to one byte.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            // SAFETY: out_pos never exceeds the number of bytes seen so far in
            // this chunk, and chunk.len() <= buf_size == outbuf.len().
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
2223
/// Squeeze from mmap'd byte slice: collapse each run of a byte in the squeeze
/// set to a single occurrence.
///
/// Dispatches 1-3 char sets to memchr-based specializations; larger sets use
/// the 256-bit membership bitset below.
///
/// For data >= 2MB: uses rayon parallel processing with boundary fixup.
/// For data <= 16MB: squeeze into one buffer, one write syscall.
/// For data > 16MB: chunked approach to limit memory.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    // Parallel path: squeeze each chunk independently, then fix boundaries
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        // 32KB floor keeps per-chunk scheduling overhead negligible.
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
        // and chunk N+1 starts with same byte B, and B is in squeeze set,
        // skip the first byte(s) of chunk N+1 that equal B.
        // Collect slices for writev to minimize syscalls.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                // Check boundary: does previous chunk end with same squeezable byte?
                // rev().find_map looks back past any empty results to the last
                // byte actually produced before this chunk.
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        // Skip leading bytes in this chunk that equal prev_last
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    // Single-write fast path: squeeze all data into one buffer, one write
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        // 256 is outside u8 range, so the first byte never matches "previous".
        let mut last_squeezed: u16 = 256;
        let len = data.len();
        let mut wp = 0;
        let mut i = 0;

        // SAFETY: outbuf has capacity data.len(); at most one output byte per
        // input byte, so wp <= len and the final set_len(wp) covers only
        // initialized bytes.
        unsafe {
            outbuf.set_len(data.len());
            let inp = data.as_ptr();
            let outp: *mut u8 = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    // Skip the remainder of this run of b.
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
            outbuf.set_len(wp);
        }
        return writer.write_all(&outbuf);
    }

    // Chunked path for large data. `last_squeezed` persists across chunks so
    // a run spanning a chunk boundary still squeezes to one byte.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        // SAFETY: wp <= i < len <= buf_size, so all raw reads/writes are
        // in-bounds.
        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    // Skip the remainder of this run of b.
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
2354
2355/// Squeeze a single chunk using bitset membership. Returns squeezed output.
2356fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
2357    let len = chunk.len();
2358    let mut out = Vec::with_capacity(len);
2359    let mut last_squeezed: u16 = 256;
2360    let mut i = 0;
2361
2362    unsafe {
2363        out.set_len(len);
2364        let inp = chunk.as_ptr();
2365        let outp: *mut u8 = out.as_mut_ptr();
2366        let mut wp = 0;
2367
2368        while i < len {
2369            let b = *inp.add(i);
2370            if is_member(member, b) {
2371                if last_squeezed != b as u16 {
2372                    *outp.add(wp) = b;
2373                    wp += 1;
2374                    last_squeezed = b as u16;
2375                }
2376                i += 1;
2377                while i < len && *inp.add(i) == b {
2378                    i += 1;
2379                }
2380            } else {
2381                last_squeezed = 256;
2382                *outp.add(wp) = b;
2383                wp += 1;
2384                i += 1;
2385            }
2386        }
2387        out.set_len(wp);
2388    }
2389    out
2390}
2391
/// Squeeze a set of exactly N (2 or 3) byte values, using memchr2/memchr3 to
/// jump between occurrences and bulk-copying the untouched spans in between.
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
    if data.len() >= PARALLEL_THRESHOLD {
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        // 32KB floor keeps per-chunk scheduling overhead negligible.
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        // Build IoSlice list, fixing boundaries: if the previous chunk's output
        // ends with a squeezable byte, drop this chunk's leading run of that
        // same byte so a run split across chunks isn't emitted twice.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut wp = 0;
    // 256 is outside u8 range, so the first byte never matches "previous".
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    // Next occurrence of any squeeze byte. N is a const generic, so the
    // branch resolves at monomorphization time.
    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    // Append $src ($len bytes) to outbuf, flushing first if it wouldn't fit;
    // a span larger than the whole buffer is written straight through.
    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > buf_size {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > buf_size {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    // Bulk-copy the non-squeezable span before the match; it
                    // ends with a non-squeeze byte, so reset the run state.
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    if wp >= buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                // Skip the remainder of this run of b.
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                // No more squeeze bytes: copy the tail and stop.
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
2498
2499fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2500    if data.is_empty() {
2501        return Ok(());
2502    }
2503
2504    if memchr::memmem::find(data, &[ch, ch]).is_none() {
2505        return writer.write_all(data);
2506    }
2507
2508    // Parallel path: squeeze each chunk, fix boundaries
2509    if data.len() >= PARALLEL_THRESHOLD {
2510        let n_threads = rayon::current_num_threads().max(1);
2511        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2512
2513        let results: Vec<Vec<u8>> = data
2514            .par_chunks(chunk_size)
2515            .map(|chunk| {
2516                let mut out = Vec::with_capacity(chunk.len());
2517                let mut cursor = 0;
2518                while cursor < chunk.len() {
2519                    match memchr::memchr(ch, &chunk[cursor..]) {
2520                        Some(offset) => {
2521                            let pos = cursor + offset;
2522                            if pos > cursor {
2523                                out.extend_from_slice(&chunk[cursor..pos]);
2524                            }
2525                            out.push(ch);
2526                            cursor = pos + 1;
2527                            while cursor < chunk.len() && chunk[cursor] == ch {
2528                                cursor += 1;
2529                            }
2530                        }
2531                        None => {
2532                            out.extend_from_slice(&chunk[cursor..]);
2533                            break;
2534                        }
2535                    }
2536                }
2537                out
2538            })
2539            .collect();
2540
2541        // Build IoSlice list, fixing boundary squeezability.
2542        // Use writev to minimize syscalls.
2543        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2544        for (idx, result) in results.iter().enumerate() {
2545            if result.is_empty() {
2546                continue;
2547            }
2548            if idx > 0 {
2549                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2550                    if prev_last == ch {
2551                        // Skip leading ch bytes in this chunk result
2552                        let skip = result.iter().take_while(|&&b| b == ch).count();
2553                        if skip < result.len() {
2554                            slices.push(std::io::IoSlice::new(&result[skip..]));
2555                        }
2556                        continue;
2557                    }
2558                }
2559            }
2560            slices.push(std::io::IoSlice::new(result));
2561        }
2562        return write_ioslices(writer, &slices);
2563    }
2564
2565    let buf_size = data.len().min(BUF_SIZE);
2566    let mut outbuf = vec![0u8; buf_size];
2567    let len = data.len();
2568    let mut wp = 0;
2569    let mut cursor = 0;
2570
2571    while cursor < len {
2572        match memchr::memchr(ch, &data[cursor..]) {
2573            Some(offset) => {
2574                let pos = cursor + offset;
2575                let gap = pos - cursor;
2576                if gap > 0 {
2577                    if wp + gap > buf_size {
2578                        writer.write_all(&outbuf[..wp])?;
2579                        wp = 0;
2580                    }
2581                    if gap > buf_size {
2582                        writer.write_all(&data[cursor..pos])?;
2583                    } else {
2584                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
2585                        wp += gap;
2586                    }
2587                }
2588                if wp >= buf_size {
2589                    writer.write_all(&outbuf[..wp])?;
2590                    wp = 0;
2591                }
2592                outbuf[wp] = ch;
2593                wp += 1;
2594                cursor = pos + 1;
2595                while cursor < len && data[cursor] == ch {
2596                    cursor += 1;
2597                }
2598            }
2599            None => {
2600                let remaining = len - cursor;
2601                if remaining > 0 {
2602                    if wp + remaining > buf_size {
2603                        writer.write_all(&outbuf[..wp])?;
2604                        wp = 0;
2605                    }
2606                    if remaining > buf_size {
2607                        writer.write_all(&data[cursor..])?;
2608                    } else {
2609                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
2610                        wp += remaining;
2611                    }
2612                }
2613                break;
2614            }
2615        }
2616    }
2617
2618    if wp > 0 {
2619        writer.write_all(&outbuf[..wp])?;
2620    }
2621    Ok(())
2622}