
coreutils_rs/tr/core.rs

1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5/// Maximum IoSlice entries per write_vectored batch.
6/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
7const MAX_IOV: usize = 1024;
8
9/// Main processing buffer: 4MB (fits in L3 cache, avoids cache thrashing).
10const BUF_SIZE: usize = 4 * 1024 * 1024;
11
12/// Stream buffer: 16MB — tr streaming operations (translate, delete, squeeze)
13/// are compute-light (single table lookup or bitset check per byte), so the
14/// bottleneck is I/O syscalls, not cache pressure. 16MB buffer means only
15/// 1 read()/write() syscall pair for a 10MB input, minimizing syscall overhead.
16/// For piped input, read_full retries partial reads to fill the entire buffer.
17/// This applies to ALL streaming modes (delete, squeeze, translate).
18const STREAM_BUF: usize = 16 * 1024 * 1024;
19
20/// Minimum data size to engage rayon parallel processing for mmap paths.
21/// Below this, single-threaded is faster due to thread pool overhead.
22const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
23
24/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
25/// Falls back to write_all per slice for partial writes.
26#[inline]
27fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
28    if slices.is_empty() {
29        return Ok(());
30    }
31    for batch in slices.chunks(MAX_IOV) {
32        let total: usize = batch.iter().map(|s| s.len()).sum();
33        match writer.write_vectored(batch) {
34            Ok(n) if n >= total => continue,
35            Ok(mut written) => {
36                // Partial write: fall back to write_all per remaining slice
37                for slice in batch {
38                    let slen = slice.len();
39                    if written >= slen {
40                        written -= slen;
41                        continue;
42                    }
43                    if written > 0 {
44                        writer.write_all(&slice[written..])?;
45                        written = 0;
46                    } else {
47                        writer.write_all(slice)?;
48                    }
49                }
50            }
51            Err(e) => return Err(e),
52        }
53    }
54    Ok(())
55}
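
// Added usage sketch (hypothetical test module, not in the original source):
// gathers a few output fragments into IoSlices and relies on write_ioslices to
// batch them (at most MAX_IOV per writev call) and to fall back on partial writes.
#[cfg(test)]
mod write_ioslices_sketch {
    use super::write_ioslices;
    use std::io::IoSlice;

    #[test]
    fn writes_all_slices_in_order() {
        let slices = [IoSlice::new(b"foo"), IoSlice::new(b"bar"), IoSlice::new(b"baz")];
        let mut out: Vec<u8> = Vec::new();
        write_ioslices(&mut out, &slices).unwrap();
        assert_eq!(&out[..], &b"foobarbaz"[..]);
    }
}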
56
57/// Allocate a Vec<u8> of given length without zero-initialization.
58/// SAFETY: Caller must write all bytes before reading them.
59#[inline]
60#[allow(clippy::uninit_vec)]
61fn alloc_uninit_vec(len: usize) -> Vec<u8> {
62    let mut v = Vec::with_capacity(len);
63    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
64    unsafe {
65        v.set_len(len);
66    }
67    v
68}
69
70/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
71#[inline]
72fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
73    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
74    let last = set2.last().copied();
75    for (i, &from) in set1.iter().enumerate() {
76        table[from as usize] = if i < set2.len() {
77            set2[i]
78        } else {
79            last.unwrap_or(from)
80        };
81    }
82    table
83}
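
// Added illustrative sketch (hypothetical test module, not in the original
// source): shows the table semantics above — matching positions map
// set1[i] -> set2[i], and when set2 is shorter its last byte is reused,
// mirroring `last.unwrap_or(from)`.
#[cfg(test)]
mod translate_table_sketch {
    use super::build_translate_table;

    #[test]
    fn maps_pairs_and_reuses_last_byte_of_set2() {
        let table = build_translate_table(b"abc", b"xy");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'y');
        // 'c' has no partner in set2, so it takes set2's last byte.
        assert_eq!(table[b'c' as usize], b'y');
        // Bytes outside set1 stay identity-mapped.
        assert_eq!(table[b'z' as usize], b'z');
    }
}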
84
85/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
86#[inline]
87fn build_member_set(chars: &[u8]) -> [u8; 32] {
88    let mut set = [0u8; 32];
89    for &ch in chars {
90        set[ch as usize >> 3] |= 1 << (ch & 7);
91    }
92    set
93}
94
95#[inline(always)]
96fn is_member(set: &[u8; 32], ch: u8) -> bool {
97    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
98}
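
// Added illustrative sketch (hypothetical test module, not in the original
// source): the 32-byte bitset marks byte `ch` via bit (ch & 7) of byte
// (ch >> 3), giving the O(1) membership test used by the squeeze/delete paths.
#[cfg(test)]
mod member_set_sketch {
    use super::{build_member_set, is_member};

    #[test]
    fn bitset_reports_exactly_the_given_bytes() {
        let set = build_member_set(b"abc");
        assert!(is_member(&set, b'a'));
        assert!(is_member(&set, b'c'));
        assert!(!is_member(&set, b'd'));
        assert!(!is_member(&set, 0));
    }
}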
99
100/// Translate bytes in-place using a 256-byte lookup table.
101/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition for
102/// ~32 bytes per iteration. Falls back to 8x-unrolled scalar on other platforms.
103#[inline(always)]
104fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
105    #[cfg(target_arch = "x86_64")]
106    {
107        if is_x86_feature_detected!("avx2") {
108            unsafe { translate_inplace_avx2_table(data, table) };
109            return;
110        }
111        if is_x86_feature_detected!("ssse3") {
112            unsafe { translate_inplace_ssse3_table(data, table) };
113            return;
114        }
115    }
116    translate_inplace_scalar(data, table);
117}
118
119/// Scalar fallback: 8x-unrolled table lookup.
120#[inline(always)]
121fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
122    let len = data.len();
123    let ptr = data.as_mut_ptr();
124    let mut i = 0;
125    unsafe {
126        while i + 8 <= len {
127            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
128            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
129            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
130            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
131            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
132            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
133            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
134            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
135            i += 8;
136        }
137        while i < len {
138            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
139            i += 1;
140        }
141    }
142}
143
144// ============================================================================
145// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
146// ============================================================================
147//
148// For an arbitrary 256-byte lookup table, we decompose each byte into
149// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
150// SIMD vectors, one for each high nibble value h (0..15), containing
151// the 16 table entries table[h*16+0..h*16+15]. Then for each input
152// vector we:
153//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
154//   2. Extract high nibble (shift right 4) -> used to select which table
155//   3. For each of the 16 high nibble values, create a mask where
156//      the high nibble equals that value, pshufb the corresponding
157//      table, and accumulate results
158//
159// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
160// With instruction-level parallelism, this achieves much higher throughput
161// than scalar table lookups which have serial data dependencies.
162
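// Added scalar model of the decomposition above (hypothetical test module, not
// in the original source and not used by the hot paths): splitting a byte b
// into hi = b >> 4 and lo = b & 0x0F and reading row `hi`, column `lo` of a
// 16x16 view of the table is exactly table[b]; the SIMD code below performs
// the same selection with pshufb.
#[cfg(test)]
mod nibble_decomposition_sketch {
    #[test]
    fn nibble_lookup_equals_direct_lookup() {
        // Arbitrary 256-byte table (byte-reverse permutation).
        let table: [u8; 256] = std::array::from_fn(|i| 255 - i as u8);
        for b in 0u8..=255 {
            let hi = (b >> 4) as usize;
            let lo = (b & 0x0F) as usize;
            assert_eq!(table[hi * 16 + lo], table[b as usize]);
        }
    }
}
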
163#[cfg(target_arch = "x86_64")]
164#[target_feature(enable = "avx2")]
165unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
166    use std::arch::x86_64::*;
167
168    unsafe {
169        let len = data.len();
170        let ptr = data.as_mut_ptr();
171
172        // Pre-build 16 lookup vectors, one per high nibble value.
173        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
174        // 16 table entries for pshufb indexing by low nibble.
175        let mut lut = [_mm256_setzero_si256(); 16];
176        for h in 0u8..16 {
177            let base = (h as usize) * 16;
178            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
179            // Broadcast the 128-bit row to both lanes of the 256-bit vector
180            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
181            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
182        }
183
184        let lo_mask = _mm256_set1_epi8(0x0F);
185
186        let mut i = 0;
187        while i + 32 <= len {
188            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
189            let lo_nibble = _mm256_and_si256(input, lo_mask);
190            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
191
192            // Accumulate result: for each high nibble value h, select bytes where
193            // hi_nibble == h, look up in lut[h] using lo_nibble, blend into result.
194            let mut result = _mm256_setzero_si256();
195
196            // Unroll the 16 high-nibble iterations for best ILP
197            macro_rules! do_nibble {
198                ($h:expr) => {
199                    let h_val = _mm256_set1_epi8($h as i8);
200                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
201                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
202                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
203                };
204            }
205            do_nibble!(0);
206            do_nibble!(1);
207            do_nibble!(2);
208            do_nibble!(3);
209            do_nibble!(4);
210            do_nibble!(5);
211            do_nibble!(6);
212            do_nibble!(7);
213            do_nibble!(8);
214            do_nibble!(9);
215            do_nibble!(10);
216            do_nibble!(11);
217            do_nibble!(12);
218            do_nibble!(13);
219            do_nibble!(14);
220            do_nibble!(15);
221
222            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
223            i += 32;
224        }
225
226        // SSE/SSSE3 tail for remaining 16-byte chunk
227        if i + 16 <= len {
228            let lo_mask128 = _mm_set1_epi8(0x0F);
229
230            // Build 128-bit LUTs (just the lower lane of each 256-bit LUT)
231            let mut lut128 = [_mm_setzero_si128(); 16];
232            for h in 0u8..16 {
233                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
234            }
235
236            let input = _mm_loadu_si128(ptr.add(i) as *const _);
237            let lo_nib = _mm_and_si128(input, lo_mask128);
238            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
239
240            let mut res = _mm_setzero_si128();
241            macro_rules! do_nibble128 {
242                ($h:expr) => {
243                    let h_val = _mm_set1_epi8($h as i8);
244                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
245                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
246                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
247                };
248            }
249            do_nibble128!(0);
250            do_nibble128!(1);
251            do_nibble128!(2);
252            do_nibble128!(3);
253            do_nibble128!(4);
254            do_nibble128!(5);
255            do_nibble128!(6);
256            do_nibble128!(7);
257            do_nibble128!(8);
258            do_nibble128!(9);
259            do_nibble128!(10);
260            do_nibble128!(11);
261            do_nibble128!(12);
262            do_nibble128!(13);
263            do_nibble128!(14);
264            do_nibble128!(15);
265
266            _mm_storeu_si128(ptr.add(i) as *mut _, res);
267            i += 16;
268        }
269
270        // Scalar tail
271        while i < len {
272            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
273            i += 1;
274        }
275    }
276}
277
278#[cfg(target_arch = "x86_64")]
279#[target_feature(enable = "ssse3")]
280unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
281    use std::arch::x86_64::*;
282
283    unsafe {
284        let len = data.len();
285        let ptr = data.as_mut_ptr();
286
287        // Pre-build 16 lookup vectors for pshufb
288        let mut lut = [_mm_setzero_si128(); 16];
289        for h in 0u8..16 {
290            let base = (h as usize) * 16;
291            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
292            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
293        }
294
295        let lo_mask = _mm_set1_epi8(0x0F);
296
297        let mut i = 0;
298        while i + 16 <= len {
299            let input = _mm_loadu_si128(ptr.add(i) as *const _);
300            let lo_nibble = _mm_and_si128(input, lo_mask);
301            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
302
303            let mut result = _mm_setzero_si128();
304
305            macro_rules! do_nibble {
306                ($h:expr) => {
307                    let h_val = _mm_set1_epi8($h as i8);
308                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
309                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
310                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
311                };
312            }
313            do_nibble!(0);
314            do_nibble!(1);
315            do_nibble!(2);
316            do_nibble!(3);
317            do_nibble!(4);
318            do_nibble!(5);
319            do_nibble!(6);
320            do_nibble!(7);
321            do_nibble!(8);
322            do_nibble!(9);
323            do_nibble!(10);
324            do_nibble!(11);
325            do_nibble!(12);
326            do_nibble!(13);
327            do_nibble!(14);
328            do_nibble!(15);
329
330            _mm_storeu_si128(ptr.add(i) as *mut _, result);
331            i += 16;
332        }
333
334        // Scalar tail
335        while i < len {
336            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
337            i += 1;
338        }
339    }
340}
341
342/// Translate bytes from source to destination using a 256-byte lookup table.
343/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
344#[inline(always)]
345fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
346    debug_assert!(dst.len() >= src.len());
347    #[cfg(target_arch = "x86_64")]
348    {
349        if is_x86_feature_detected!("avx2") {
350            unsafe { translate_to_avx2_table(src, dst, table) };
351            return;
352        }
353        if is_x86_feature_detected!("ssse3") {
354            unsafe { translate_to_ssse3_table(src, dst, table) };
355            return;
356        }
357    }
358    translate_to_scalar(src, dst, table);
359}
360
361/// Scalar fallback for translate_to.
362#[inline(always)]
363fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
364    unsafe {
365        let sp = src.as_ptr();
366        let dp = dst.as_mut_ptr();
367        let len = src.len();
368        let mut i = 0;
369        while i + 8 <= len {
370            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
371            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
372            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
373            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
374            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
375            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
376            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
377            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
378            i += 8;
379        }
380        while i < len {
381            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
382            i += 1;
383        }
384    }
385}
386
387#[cfg(target_arch = "x86_64")]
388#[target_feature(enable = "avx2")]
389unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
390    use std::arch::x86_64::*;
391
392    unsafe {
393        let len = src.len();
394        let sp = src.as_ptr();
395        let dp = dst.as_mut_ptr();
396
397        // Pre-build 16 lookup vectors
398        let mut lut = [_mm256_setzero_si256(); 16];
399        for h in 0u8..16 {
400            let base = (h as usize) * 16;
401            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
402            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
403            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
404        }
405
406        let lo_mask = _mm256_set1_epi8(0x0F);
407
408        let mut i = 0;
409        while i + 32 <= len {
410            let input = _mm256_loadu_si256(sp.add(i) as *const _);
411            let lo_nibble = _mm256_and_si256(input, lo_mask);
412            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
413
414            let mut result = _mm256_setzero_si256();
415
416            macro_rules! do_nibble {
417                ($h:expr) => {
418                    let h_val = _mm256_set1_epi8($h as i8);
419                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
420                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
421                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
422                };
423            }
424            do_nibble!(0);
425            do_nibble!(1);
426            do_nibble!(2);
427            do_nibble!(3);
428            do_nibble!(4);
429            do_nibble!(5);
430            do_nibble!(6);
431            do_nibble!(7);
432            do_nibble!(8);
433            do_nibble!(9);
434            do_nibble!(10);
435            do_nibble!(11);
436            do_nibble!(12);
437            do_nibble!(13);
438            do_nibble!(14);
439            do_nibble!(15);
440
441            _mm256_storeu_si256(dp.add(i) as *mut _, result);
442            i += 32;
443        }
444
445        // SSSE3 tail for remaining 16-byte chunk
446        if i + 16 <= len {
447            let lo_mask128 = _mm_set1_epi8(0x0F);
448            let mut lut128 = [_mm_setzero_si128(); 16];
449            for h in 0u8..16 {
450                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
451            }
452
453            let input = _mm_loadu_si128(sp.add(i) as *const _);
454            let lo_nib = _mm_and_si128(input, lo_mask128);
455            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
456
457            let mut res = _mm_setzero_si128();
458            macro_rules! do_nibble128 {
459                ($h:expr) => {
460                    let h_val = _mm_set1_epi8($h as i8);
461                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
462                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
463                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
464                };
465            }
466            do_nibble128!(0);
467            do_nibble128!(1);
468            do_nibble128!(2);
469            do_nibble128!(3);
470            do_nibble128!(4);
471            do_nibble128!(5);
472            do_nibble128!(6);
473            do_nibble128!(7);
474            do_nibble128!(8);
475            do_nibble128!(9);
476            do_nibble128!(10);
477            do_nibble128!(11);
478            do_nibble128!(12);
479            do_nibble128!(13);
480            do_nibble128!(14);
481            do_nibble128!(15);
482
483            _mm_storeu_si128(dp.add(i) as *mut _, res);
484            i += 16;
485        }
486
487        // Scalar tail
488        while i < len {
489            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
490            i += 1;
491        }
492    }
493}
494
495#[cfg(target_arch = "x86_64")]
496#[target_feature(enable = "ssse3")]
497unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
498    use std::arch::x86_64::*;
499
500    unsafe {
501        let len = src.len();
502        let sp = src.as_ptr();
503        let dp = dst.as_mut_ptr();
504
505        let mut lut = [_mm_setzero_si128(); 16];
506        for h in 0u8..16 {
507            let base = (h as usize) * 16;
508            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
509            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
510        }
511
512        let lo_mask = _mm_set1_epi8(0x0F);
513
514        let mut i = 0;
515        while i + 16 <= len {
516            let input = _mm_loadu_si128(sp.add(i) as *const _);
517            let lo_nibble = _mm_and_si128(input, lo_mask);
518            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
519
520            let mut result = _mm_setzero_si128();
521
522            macro_rules! do_nibble {
523                ($h:expr) => {
524                    let h_val = _mm_set1_epi8($h as i8);
525                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
526                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
527                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
528                };
529            }
530            do_nibble!(0);
531            do_nibble!(1);
532            do_nibble!(2);
533            do_nibble!(3);
534            do_nibble!(4);
535            do_nibble!(5);
536            do_nibble!(6);
537            do_nibble!(7);
538            do_nibble!(8);
539            do_nibble!(9);
540            do_nibble!(10);
541            do_nibble!(11);
542            do_nibble!(12);
543            do_nibble!(13);
544            do_nibble!(14);
545            do_nibble!(15);
546
547            _mm_storeu_si128(dp.add(i) as *mut _, result);
548            i += 16;
549        }
550
551        // Scalar tail
552        while i < len {
553            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
554            i += 1;
555        }
556    }
557}
558
559// ============================================================================
560// SIMD range translation (x86_64)
561// ============================================================================
562
563/// Detect if the translate table is a single contiguous range with constant offset.
564/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
565/// table[i] = i + offset for all i in [lo, hi].
566#[inline]
567fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
568    let mut lo: Option<u8> = None;
569    let mut hi = 0u8;
570    let mut offset = 0i16;
571
572    for i in 0..256 {
573        if table[i] != i as u8 {
574            let diff = table[i] as i16 - i as i16;
575            match lo {
576                None => {
577                    lo = Some(i as u8);
578                    hi = i as u8;
579                    offset = diff;
580                }
581                Some(_) => {
582                    if diff != offset || i as u8 != hi.wrapping_add(1) {
583                        return None;
584                    }
585                    hi = i as u8;
586                }
587            }
588        }
589    }
590
591    lo.map(|l| (l, hi, offset as i8))
592}
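
// Added illustrative sketch (hypothetical test module, not in the original
// source): the classic `tr a-z A-Z` table is one contiguous range with a
// constant offset, so detect_range_offset reports (b'a', b'z', -32) and the
// SIMD range path below can be taken.
#[cfg(test)]
mod range_offset_sketch {
    use super::{build_translate_table, detect_range_offset};

    #[test]
    fn lowercase_to_uppercase_is_a_constant_offset_range() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&set1, &set2);
        assert_eq!(detect_range_offset(&table), Some((b'a', b'z', -32)));
    }
}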
593
594/// SIMD-accelerated range translation for mmap'd data.
595/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
596/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
597#[cfg(target_arch = "x86_64")]
598fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
599    if is_x86_feature_detected!("avx2") {
600        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
601    } else {
602        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
603    }
604}
605
606#[cfg(target_arch = "x86_64")]
607#[target_feature(enable = "avx2")]
608unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
609    use std::arch::x86_64::*;
610
611    unsafe {
612        let range = hi - lo;
613        // Bias: shift range so lo maps to -128 (signed min).
614        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
615        // For input < lo: biased wraps to large positive (signed), > threshold.
616        // For input > hi: biased > -128+range, > threshold.
617        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
618        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
619        let offset_v = _mm256_set1_epi8(offset);
620        let zero = _mm256_setzero_si256();
621
622        let len = src.len();
623        let mut i = 0;
624
625        while i + 32 <= len {
626            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
627            let biased = _mm256_add_epi8(input, bias_v);
628            // gt = 0xFF where biased > threshold (OUT of range)
629            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
630            // mask = 0xFF where IN range (NOT gt)
631            let mask = _mm256_cmpeq_epi8(gt, zero);
632            let offset_masked = _mm256_and_si256(mask, offset_v);
633            let result = _mm256_add_epi8(input, offset_masked);
634            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
635            i += 32;
636        }
637
638        // SSE2 tail for 16-byte remainder
639        if i + 16 <= len {
640            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
641            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
642            let offset_v128 = _mm_set1_epi8(offset);
643            let zero128 = _mm_setzero_si128();
644
645            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
646            let biased = _mm_add_epi8(input, bias_v128);
647            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
648            let mask = _mm_cmpeq_epi8(gt, zero128);
649            let offset_masked = _mm_and_si128(mask, offset_v128);
650            let result = _mm_add_epi8(input, offset_masked);
651            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
652            i += 16;
653        }
654
655        // Scalar tail
656        while i < len {
657            let b = *src.get_unchecked(i);
658            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
659                b.wrapping_add(offset as u8)
660            } else {
661                b
662            };
663            i += 1;
664        }
665    }
666}
667
668#[cfg(target_arch = "x86_64")]
669#[target_feature(enable = "sse2")]
670unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
671    use std::arch::x86_64::*;
672
673    unsafe {
674        let range = hi - lo;
675        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
676        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
677        let offset_v = _mm_set1_epi8(offset);
678        let zero = _mm_setzero_si128();
679
680        let len = src.len();
681        let mut i = 0;
682
683        while i + 16 <= len {
684            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
685            let biased = _mm_add_epi8(input, bias_v);
686            let gt = _mm_cmpgt_epi8(biased, threshold_v);
687            let mask = _mm_cmpeq_epi8(gt, zero);
688            let offset_masked = _mm_and_si128(mask, offset_v);
689            let result = _mm_add_epi8(input, offset_masked);
690            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
691            i += 16;
692        }
693
694        while i < len {
695            let b = *src.get_unchecked(i);
696            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
697                b.wrapping_add(offset as u8)
698            } else {
699                b
700            };
701            i += 1;
702        }
703    }
704}
705
706/// Scalar range translation fallback for non-x86_64.
707#[cfg(not(target_arch = "x86_64"))]
708fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
709    for (i, &b) in src.iter().enumerate() {
710        dst[i] = if b >= lo && b <= hi {
711            b.wrapping_add(offset as u8)
712        } else {
713            b
714        };
715    }
716}
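
// Added worked example of the bias+threshold trick used by the range kernels
// above (hypothetical test module, not in the original source): adding
// 0x80 - lo maps the in-range bytes onto the signed minimum, so one signed
// compare against 0x80 + (hi - lo) classifies every byte. The scalar model
// below mirrors a single SIMD lane and checks it against a plain range test.
#[cfg(test)]
mod bias_threshold_sketch {
    #[test]
    fn biased_signed_compare_matches_plain_range_test() {
        let (lo, hi) = (b'0', b'9');
        let bias = 0x80u8.wrapping_sub(lo);
        let threshold = 0x80u8.wrapping_add(hi - lo) as i8;
        for b in 0u8..=255 {
            let biased = b.wrapping_add(bias) as i8;
            // In range exactly when the biased byte does NOT exceed the threshold,
            // matching `cmpeq(cmpgt(biased, threshold), 0)` in the SIMD code.
            let in_range_model = biased <= threshold;
            assert_eq!(in_range_model, (lo..=hi).contains(&b), "byte {:#x}", b);
        }
    }
}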
717
718// ============================================================================
719// In-place SIMD range translation (saves one buffer allocation in streaming)
720// ============================================================================
721
722/// In-place SIMD-accelerated range translation.
723/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
724/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
725#[cfg(target_arch = "x86_64")]
726fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
727    if is_x86_feature_detected!("avx2") {
728        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
729    } else {
730        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
731    }
732}
733
734#[cfg(target_arch = "x86_64")]
735#[target_feature(enable = "avx2")]
736unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
737    use std::arch::x86_64::*;
738
739    unsafe {
740        let range = hi - lo;
741        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
742        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
743        let offset_v = _mm256_set1_epi8(offset);
744        let zero = _mm256_setzero_si256();
745
746        let len = data.len();
747        let ptr = data.as_mut_ptr();
748        let mut i = 0;
749
750        while i + 32 <= len {
751            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
752            let biased = _mm256_add_epi8(input, bias_v);
753            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
754            let mask = _mm256_cmpeq_epi8(gt, zero);
755            let offset_masked = _mm256_and_si256(mask, offset_v);
756            let result = _mm256_add_epi8(input, offset_masked);
757            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
758            i += 32;
759        }
760
761        if i + 16 <= len {
762            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
763            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
764            let offset_v128 = _mm_set1_epi8(offset);
765            let zero128 = _mm_setzero_si128();
766
767            let input = _mm_loadu_si128(ptr.add(i) as *const _);
768            let biased = _mm_add_epi8(input, bias_v128);
769            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
770            let mask = _mm_cmpeq_epi8(gt, zero128);
771            let offset_masked = _mm_and_si128(mask, offset_v128);
772            let result = _mm_add_epi8(input, offset_masked);
773            _mm_storeu_si128(ptr.add(i) as *mut _, result);
774            i += 16;
775        }
776
777        while i < len {
778            let b = *ptr.add(i);
779            *ptr.add(i) = if b >= lo && b <= hi {
780                b.wrapping_add(offset as u8)
781            } else {
782                b
783            };
784            i += 1;
785        }
786    }
787}
788
789#[cfg(target_arch = "x86_64")]
790#[target_feature(enable = "sse2")]
791unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
792    use std::arch::x86_64::*;
793
794    unsafe {
795        let range = hi - lo;
796        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
797        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
798        let offset_v = _mm_set1_epi8(offset);
799        let zero = _mm_setzero_si128();
800
801        let len = data.len();
802        let ptr = data.as_mut_ptr();
803        let mut i = 0;
804
805        while i + 16 <= len {
806            let input = _mm_loadu_si128(ptr.add(i) as *const _);
807            let biased = _mm_add_epi8(input, bias_v);
808            let gt = _mm_cmpgt_epi8(biased, threshold_v);
809            let mask = _mm_cmpeq_epi8(gt, zero);
810            let offset_masked = _mm_and_si128(mask, offset_v);
811            let result = _mm_add_epi8(input, offset_masked);
812            _mm_storeu_si128(ptr.add(i) as *mut _, result);
813            i += 16;
814        }
815
816        while i < len {
817            let b = *ptr.add(i);
818            *ptr.add(i) = if b >= lo && b <= hi {
819                b.wrapping_add(offset as u8)
820            } else {
821                b
822            };
823            i += 1;
824        }
825    }
826}
827
828#[cfg(not(target_arch = "x86_64"))]
829fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
830    for b in data.iter_mut() {
831        if *b >= lo && *b <= hi {
832            *b = b.wrapping_add(offset as u8);
833        }
834    }
835}
836
837// ============================================================================
838// SIMD range deletion (x86_64)
839// ============================================================================
840
841/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
842/// Returns Some((lo, hi)) if so. This is true for common classes:
843/// - `[:digit:]` = 0x30..=0x39
844/// - `a-z` = 0x61..=0x7A
845/// - `A-Z` = 0x41..=0x5A
846#[inline]
847fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
848    if chars.is_empty() {
849        return None;
850    }
851    let mut seen = [false; 256];
852    let mut lo = chars[0];
853    let mut hi = chars[0];
854    for &c in chars {
855        seen[c as usize] = true;
856        if c < lo { lo = c; }
857        if c > hi { hi = c; }
858    }
859    // Require every byte in [lo..=hi] to be present (no gaps). A plain length
860    // check would be fooled by duplicate entries in `chars`, wrongly widening
861    // the range to bytes that are not in the delete set.
862    if (lo..=hi).all(|c| seen[c as usize]) {
863        Some((lo, hi))
864    } else {
865        None
866    }
867}
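
// Added illustrative sketch (hypothetical test module, not in the original
// source): contiguous classes such as [:digit:] are detected as a byte range,
// while a set with gaps is rejected and delete() falls back to the bitset path.
#[cfg(test)]
mod delete_range_sketch {
    use super::detect_delete_range;

    #[test]
    fn digits_form_a_range_but_gapped_sets_do_not() {
        let digits: Vec<u8> = (b'0'..=b'9').collect();
        assert_eq!(detect_delete_range(&digits), Some((b'0', b'9')));
        assert_eq!(detect_delete_range(b"acz"), None);
        assert_eq!(detect_delete_range(&[]), None);
    }
}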
868
869/// SIMD-accelerated delete for contiguous byte ranges.
870/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
871/// then compacts output by skipping matched bytes.
872#[cfg(target_arch = "x86_64")]
873fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
874    if is_x86_feature_detected!("avx2") {
875        unsafe { delete_range_avx2(src, dst, lo, hi) }
876    } else {
877        unsafe { delete_range_sse2(src, dst, lo, hi) }
878    }
879}
880
881#[cfg(target_arch = "x86_64")]
882#[target_feature(enable = "avx2")]
883unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
884    use std::arch::x86_64::*;
885
886    unsafe {
887        let range = hi - lo;
888        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
889        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
890        let zero = _mm256_setzero_si256();
891
892        let len = src.len();
893        let sp = src.as_ptr();
894        let dp = dst.as_mut_ptr();
895        let mut ri = 0;
896        let mut wp = 0;
897
898        while ri + 32 <= len {
899            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
900            let biased = _mm256_add_epi8(input, bias_v);
901            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
902            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
903            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
904            let in_range = _mm256_cmpeq_epi8(gt, zero);
905            // keep_mask bits: 1 = keep (NOT in range)
906            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
907
908            if keep_mask == 0xFFFFFFFF {
909                // All 32 bytes are kept — bulk copy
910                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
911                wp += 32;
912            } else if keep_mask != 0 {
913                // Partial keep — process each 8-byte lane with popcnt
914                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
915                let c0 = (keep_mask as u8).count_ones() as usize;
916                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
917                let c1 = ((keep_mask >> 8) as u8).count_ones() as usize;
918                compact_8bytes(
919                    sp.add(ri + 16),
920                    dp.add(wp + c0 + c1),
921                    (keep_mask >> 16) as u8,
922                );
923                let c2 = ((keep_mask >> 16) as u8).count_ones() as usize;
924                compact_8bytes(
925                    sp.add(ri + 24),
926                    dp.add(wp + c0 + c1 + c2),
927                    (keep_mask >> 24) as u8,
928                );
929                let c3 = ((keep_mask >> 24) as u8).count_ones() as usize;
930                wp += c0 + c1 + c2 + c3;
931            }
932            // else: keep_mask == 0 means all bytes deleted, skip entirely
933            ri += 32;
934        }
935
936        // SSE2 tail for 16-byte remainder
937        if ri + 16 <= len {
938            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
939            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
940            let zero128 = _mm_setzero_si128();
941
942            let input = _mm_loadu_si128(sp.add(ri) as *const _);
943            let biased = _mm_add_epi8(input, bias_v128);
944            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
945            let in_range = _mm_cmpeq_epi8(gt, zero128);
946            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
947
948            if keep_mask == 0xFFFF {
949                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
950                wp += 16;
951            } else if keep_mask != 0 {
952                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
953                let c0 = (keep_mask as u8).count_ones() as usize;
954                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
955                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
956            }
957            ri += 16;
958        }
959
960        // Scalar tail
961        while ri < len {
962            let b = *sp.add(ri);
963            if b < lo || b > hi {
964                *dp.add(wp) = b;
965                wp += 1;
966            }
967            ri += 1;
968        }
969
970        wp
971    }
972}
973
974/// Compact 8 source bytes into contiguous output bytes using a keep mask.
975/// Each bit in `mask` indicates whether the corresponding byte should be kept.
976/// Iterates over the set bits of `mask` (trailing_zeros loop), writing only the
977/// kept bytes; the caller advances its write cursor by the mask's popcount.
978#[cfg(target_arch = "x86_64")]
979#[inline(always)]
980unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
981    // For each set bit position in the mask, copy that byte from src to the
982    // next output position. Using a tight trailing_zeros loop is faster than
983    // a lookup table for 8-bit masks because it's branch-predicted well
984    // and avoids cache misses on the table.
985    unsafe {
986        let mut m = mask;
987        let mut w = 0;
988        while m != 0 {
989            let bit = m.trailing_zeros() as usize;
990            *dst.add(w) = *src.add(bit);
991            w += 1;
992            m &= m - 1;
993        }
994    }
995}
996
997#[cfg(target_arch = "x86_64")]
998#[target_feature(enable = "sse2")]
999unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
1000    use std::arch::x86_64::*;
1001
1002    unsafe {
1003        let range = hi - lo;
1004        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1005        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1006        let zero = _mm_setzero_si128();
1007
1008        let len = src.len();
1009        let sp = src.as_ptr();
1010        let dp = dst.as_mut_ptr();
1011        let mut ri = 0;
1012        let mut wp = 0;
1013
1014        while ri + 16 <= len {
1015            let input = _mm_loadu_si128(sp.add(ri) as *const _);
1016            let biased = _mm_add_epi8(input, bias_v);
1017            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1018            let in_range = _mm_cmpeq_epi8(gt, zero);
1019            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
1020
1021            if keep_mask == 0xFFFF {
1022                // All 16 bytes kept — bulk copy
1023                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
1024                wp += 16;
1025            } else if keep_mask != 0 {
1026                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
1027                let c0 = (keep_mask as u8).count_ones() as usize;
1028                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
1029                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
1030            }
1031            ri += 16;
1032        }
1033
1034        while ri < len {
1035            let b = *sp.add(ri);
1036            if b < lo || b > hi {
1037                *dp.add(wp) = b;
1038                wp += 1;
1039            }
1040            ri += 1;
1041        }
1042
1043        wp
1044    }
1045}
1046
1047/// Scalar range delete fallback for non-x86_64.
1048#[cfg(not(target_arch = "x86_64"))]
1049fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
1050    let mut wp = 0;
1051    for &b in src {
1052        if b < lo || b > hi {
1053            dst[wp] = b;
1054            wp += 1;
1055        }
1056    }
1057    wp
1058}
1059
1060/// Streaming delete for contiguous byte ranges using SIMD range detection.
1061/// Uses the 16MB STREAM_BUF to reduce syscalls (delete is compute-light, I/O bound).
1062/// When no bytes are deleted from a chunk (common for data with few matches),
1063/// writes directly from the source buffer to avoid the copy overhead.
1064fn delete_range_streaming(
1065    lo: u8,
1066    hi: u8,
1067    reader: &mut impl Read,
1068    writer: &mut impl Write,
1069) -> io::Result<()> {
1070    let mut src = vec![0u8; STREAM_BUF];
1071    let mut dst = alloc_uninit_vec(STREAM_BUF);
1072    loop {
1073        let n = read_full(reader, &mut src)?;
1074        if n == 0 {
1075            break;
1076        }
1077        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
1078        if wp == n {
1079            // No bytes deleted — write source directly (avoids copy overhead)
1080            writer.write_all(&src[..n])?;
1081        } else if wp > 0 {
1082            writer.write_all(&dst[..wp])?;
1083        }
1084    }
1085    Ok(())
1086}
1087
1088// ============================================================================
1089// Streaming functions (Read + Write)
1090// ============================================================================
1091
1092pub fn translate(
1093    set1: &[u8],
1094    set2: &[u8],
1095    reader: &mut impl Read,
1096    writer: &mut impl Write,
1097) -> io::Result<()> {
1098    let table = build_translate_table(set1, set2);
1099
1100    // Check for identity table — pure passthrough (no transformation needed)
1101    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1102    if is_identity {
1103        return passthrough_stream(reader, writer);
1104    }
1105
1106    // Try SIMD fast path for range translations (in-place, single buffer)
1107    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1108        return translate_range_stream(lo, hi, offset, reader, writer);
1109    }
1110
1111    // General case: IN-PLACE translation on a single STREAM_BUF-sized buffer.
1112    // Versus the old separate src/dst approach this halves the buffer footprint
1113    // (one 16MB allocation instead of two) and keeps the translate step's loads
1114    // and stores on the same cache lines, improving locality.
1115    // The 8x-unrolled in-place translate avoids store-to-load forwarding stalls
1116    // because consecutive reads are 8 bytes apart (sequential), not aliased.
1117    // The large buffer also minimizes syscalls: one read()/write() pair per 16MB of input.
1118    let mut buf = vec![0u8; STREAM_BUF];
1119    loop {
1120        let n = read_full(reader, &mut buf)?;
1121        if n == 0 {
1122            break;
1123        }
1124        translate_inplace(&mut buf[..n], &table);
1125        writer.write_all(&buf[..n])?;
1126    }
1127    Ok(())
1128}
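
// Added usage sketch (hypothetical test module, not in the original source):
// end-to-end `tr a-z A-Z` over in-memory reader/writer, exercising the
// range-detection fast path inside translate().
#[cfg(test)]
mod translate_stream_sketch {
    use super::translate;

    #[test]
    fn uppercases_ascii_letters() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut input: &[u8] = b"hello, tr!";
        let mut out: Vec<u8> = Vec::new();
        translate(&set1, &set2, &mut input, &mut out).unwrap();
        assert_eq!(&out[..], &b"HELLO, TR!"[..]);
    }
}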
1129
1130/// Streaming SIMD range translation — single buffer, in-place transform.
1131/// Uses the 16MB STREAM_BUF for fewer syscalls (translate is compute-light).
1132fn translate_range_stream(
1133    lo: u8,
1134    hi: u8,
1135    offset: i8,
1136    reader: &mut impl Read,
1137    writer: &mut impl Write,
1138) -> io::Result<()> {
1139    let mut buf = vec![0u8; STREAM_BUF];
1140    loop {
1141        let n = read_full(reader, &mut buf)?;
1142        if n == 0 {
1143            break;
1144        }
1145        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
1146        writer.write_all(&buf[..n])?;
1147    }
1148    Ok(())
1149}
1150
1151/// Pure passthrough: copy stdin to stdout without transformation.
1152/// Uses a single 16MB STREAM_BUF buffer with direct read/write, no processing overhead.
1153fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1154    let mut buf = vec![0u8; STREAM_BUF];
1155    loop {
1156        let n = read_full(reader, &mut buf)?;
1157        if n == 0 {
1158            break;
1159        }
1160        writer.write_all(&buf[..n])?;
1161    }
1162    Ok(())
1163}
1164
1165/// Read as many bytes as possible into buf, retrying on partial reads.
1166/// Fast path: first read() often fills the entire buffer for regular files.
1167#[inline]
1168fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
1169    // Fast path: first read() usually fills the entire buffer for regular files
1170    let n = reader.read(buf)?;
1171    if n == buf.len() || n == 0 {
1172        return Ok(n);
1173    }
1174    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
1175    let mut total = n;
1176    while total < buf.len() {
1177        match reader.read(&mut buf[total..]) {
1178            Ok(0) => break,
1179            Ok(n) => total += n,
1180            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
1181            Err(e) => return Err(e),
1182        }
1183    }
1184    Ok(total)
1185}
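
// Added usage sketch (hypothetical test module, not in the original source):
// read_full keeps retrying short reads, so a reader that hands out data in
// pieces (modeled here with Read::chain) still fills the buffer.
#[cfg(test)]
mod read_full_sketch {
    use super::read_full;
    use std::io::Read;

    #[test]
    fn fills_buffer_across_partial_reads() {
        let mut reader = (&b"hello "[..]).chain(&b"world"[..]);
        let mut buf = [0u8; 11];
        let n = read_full(&mut reader, &mut buf).unwrap();
        assert_eq!(n, 11);
        assert_eq!(&buf[..n], &b"hello world"[..]);
    }
}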
1186
1187pub fn translate_squeeze(
1188    set1: &[u8],
1189    set2: &[u8],
1190    reader: &mut impl Read,
1191    writer: &mut impl Write,
1192) -> io::Result<()> {
1193    let table = build_translate_table(set1, set2);
1194    let squeeze_set = build_member_set(set2);
1195
1196    // Two-pass optimization for range translations:
1197    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
1198    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
1199    // Even though it's two passes, the translate pass is so much faster with SIMD
1200    // that the total is still a net win.
1201    let range_info = detect_range_offset(&table);
1202
1203    let mut buf = vec![0u8; STREAM_BUF];
1204    let mut last_squeezed: u16 = 256;
1205
1206    loop {
1207        let n = read_full(reader, &mut buf)?;
1208        if n == 0 {
1209            break;
1210        }
1211        // Pass 1: translate
1212        if let Some((lo, hi, offset)) = range_info {
1213            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
1214        } else {
1215            translate_inplace(&mut buf[..n], &table);
1216        }
1217        // Pass 2: squeeze in-place
1218        let mut wp = 0;
1219        unsafe {
1220            let ptr = buf.as_mut_ptr();
1221            for i in 0..n {
1222                let b = *ptr.add(i);
1223                if is_member(&squeeze_set, b) {
1224                    if last_squeezed == b as u16 {
1225                        continue;
1226                    }
1227                    last_squeezed = b as u16;
1228                } else {
1229                    last_squeezed = 256;
1230                }
1231                *ptr.add(wp) = b;
1232                wp += 1;
1233            }
1234        }
1235        writer.write_all(&buf[..wp])?;
1236    }
1237    Ok(())
1238}
1239
1240pub fn delete(
1241    delete_chars: &[u8],
1242    reader: &mut impl Read,
1243    writer: &mut impl Write,
1244) -> io::Result<()> {
1245    if delete_chars.len() == 1 {
1246        return delete_single_streaming(delete_chars[0], reader, writer);
1247    }
1248    if delete_chars.len() <= 3 {
1249        return delete_multi_streaming(delete_chars, reader, writer);
1250    }
1251
1252    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
1253    // use vectorized range comparison instead of scalar bitset lookup.
1254    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
1255    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1256        return delete_range_streaming(lo, hi, reader, writer);
1257    }
1258
1259    let member = build_member_set(delete_chars);
1260    let mut buf = vec![0u8; STREAM_BUF];
1261
1262    loop {
1263        let n = read_full(reader, &mut buf)?;
1264        if n == 0 {
1265            break;
1266        }
1267        let mut wp = 0;
1268        unsafe {
1269            let ptr = buf.as_mut_ptr();
1270            let mut i = 0;
1271            while i + 8 <= n {
1272                let b0 = *ptr.add(i);
1273                let b1 = *ptr.add(i + 1);
1274                let b2 = *ptr.add(i + 2);
1275                let b3 = *ptr.add(i + 3);
1276                let b4 = *ptr.add(i + 4);
1277                let b5 = *ptr.add(i + 5);
1278                let b6 = *ptr.add(i + 6);
1279                let b7 = *ptr.add(i + 7);
1280
1281                // Branchless: write byte then conditionally advance pointer.
1282                // Avoids branch mispredictions when most bytes are kept.
1283                *ptr.add(wp) = b0;
1284                wp += !is_member(&member, b0) as usize;
1285                *ptr.add(wp) = b1;
1286                wp += !is_member(&member, b1) as usize;
1287                *ptr.add(wp) = b2;
1288                wp += !is_member(&member, b2) as usize;
1289                *ptr.add(wp) = b3;
1290                wp += !is_member(&member, b3) as usize;
1291                *ptr.add(wp) = b4;
1292                wp += !is_member(&member, b4) as usize;
1293                *ptr.add(wp) = b5;
1294                wp += !is_member(&member, b5) as usize;
1295                *ptr.add(wp) = b6;
1296                wp += !is_member(&member, b6) as usize;
1297                *ptr.add(wp) = b7;
1298                wp += !is_member(&member, b7) as usize;
1299                i += 8;
1300            }
1301            while i < n {
1302                let b = *ptr.add(i);
1303                *ptr.add(wp) = b;
1304                wp += !is_member(&member, b) as usize;
1305                i += 1;
1306            }
1307        }
1308        writer.write_all(&buf[..wp])?;
1309    }
1310    Ok(())
1311}
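
// Added usage sketch (hypothetical test module, not in the original source):
// deleting the digit class goes through the contiguous-range fast path on
// x86_64 and the bitset loop elsewhere; both produce the same output stream.
#[cfg(test)]
mod delete_stream_sketch {
    use super::delete;

    #[test]
    fn removes_digits() {
        let digits: Vec<u8> = (b'0'..=b'9').collect();
        let mut input: &[u8] = b"a1b22c333";
        let mut out: Vec<u8> = Vec::new();
        delete(&digits, &mut input, &mut out).unwrap();
        assert_eq!(&out[..], &b"abc"[..]);
    }
}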
1312
1313fn delete_single_streaming(
1314    ch: u8,
1315    reader: &mut impl Read,
1316    writer: &mut impl Write,
1317) -> io::Result<()> {
1318    let mut src = vec![0u8; STREAM_BUF];
1319    let mut dst = alloc_uninit_vec(STREAM_BUF);
1320    loop {
1321        let n = read_full(reader, &mut src)?;
1322        if n == 0 {
1323            break;
1324        }
1325        // Use memchr to find byte positions, gap-copy to separate dst buffer.
1326        // Separate src/dst allows copy_nonoverlapping (faster than memmove)
1327        // and avoids aliasing concerns in the hot loop.
1328        let mut wp = 0;
1329        let mut i = 0;
1330        while i < n {
1331            match memchr::memchr(ch, &src[i..n]) {
1332                Some(offset) => {
1333                    if offset > 0 {
1334                        unsafe {
1335                            std::ptr::copy_nonoverlapping(
1336                                src.as_ptr().add(i),
1337                                dst.as_mut_ptr().add(wp),
1338                                offset,
1339                            );
1340                        }
1341                        wp += offset;
1342                    }
1343                    i += offset + 1;
1344                }
1345                None => {
1346                    let run_len = n - i;
1347                    if run_len > 0 {
1348                        unsafe {
1349                            std::ptr::copy_nonoverlapping(
1350                                src.as_ptr().add(i),
1351                                dst.as_mut_ptr().add(wp),
1352                                run_len,
1353                            );
1354                        }
1355                        wp += run_len;
1356                    }
1357                    break;
1358                }
1359            }
1360        }
1361        // If nothing was deleted, write from src directly (avoids extra copy)
1362        if wp == n {
1363            writer.write_all(&src[..n])?;
1364        } else if wp > 0 {
1365            writer.write_all(&dst[..wp])?;
1366        }
1367    }
1368    Ok(())
1369}
1370
1371fn delete_multi_streaming(
1372    chars: &[u8],
1373    reader: &mut impl Read,
1374    writer: &mut impl Write,
1375) -> io::Result<()> {
1376    let mut src = vec![0u8; STREAM_BUF];
1377    let mut dst = alloc_uninit_vec(STREAM_BUF);
1378    loop {
1379        let n = read_full(reader, &mut src)?;
1380        if n == 0 {
1381            break;
1382        }
1383        // Use memchr2/memchr3 to find byte positions, gap-copy to separate dst buffer.
1384        // Separate src/dst allows copy_nonoverlapping (faster than memmove).
1385        let mut wp = 0;
1386        let mut i = 0;
1387        while i < n {
1388            let found = if chars.len() == 2 {
1389                memchr::memchr2(chars[0], chars[1], &src[i..n])
1390            } else {
1391                memchr::memchr3(chars[0], chars[1], chars[2], &src[i..n])
1392            };
1393            match found {
1394                Some(offset) => {
1395                    if offset > 0 {
1396                        unsafe {
1397                            std::ptr::copy_nonoverlapping(
1398                                src.as_ptr().add(i),
1399                                dst.as_mut_ptr().add(wp),
1400                                offset,
1401                            );
1402                        }
1403                        wp += offset;
1404                    }
1405                    i += offset + 1;
1406                }
1407                None => {
1408                    let run_len = n - i;
1409                    if run_len > 0 {
1410                        unsafe {
1411                            std::ptr::copy_nonoverlapping(
1412                                src.as_ptr().add(i),
1413                                dst.as_mut_ptr().add(wp),
1414                                run_len,
1415                            );
1416                        }
1417                        wp += run_len;
1418                    }
1419                    break;
1420                }
1421            }
1422        }
1423        if wp == n {
1424            writer.write_all(&src[..n])?;
1425        } else if wp > 0 {
1426            writer.write_all(&dst[..wp])?;
1427        }
1428    }
1429    Ok(())
1430}
1431
1432pub fn delete_squeeze(
1433    delete_chars: &[u8],
1434    squeeze_chars: &[u8],
1435    reader: &mut impl Read,
1436    writer: &mut impl Write,
1437) -> io::Result<()> {
1438    let delete_set = build_member_set(delete_chars);
1439    let squeeze_set = build_member_set(squeeze_chars);
1440    let mut buf = vec![0u8; STREAM_BUF];
1441    let mut last_squeezed: u16 = 256;
1442
1443    loop {
1444        let n = read_full(reader, &mut buf)?;
1445        if n == 0 {
1446            break;
1447        }
1448        let mut wp = 0;
1449        unsafe {
1450            let ptr = buf.as_mut_ptr();
1451            for i in 0..n {
1452                let b = *ptr.add(i);
1453                if is_member(&delete_set, b) {
1454                    continue;
1455                }
1456                if is_member(&squeeze_set, b) {
1457                    if last_squeezed == b as u16 {
1458                        continue;
1459                    }
1460                    last_squeezed = b as u16;
1461                } else {
1462                    last_squeezed = 256;
1463                }
1464                *ptr.add(wp) = b;
1465                wp += 1;
1466            }
1467        }
1468        writer.write_all(&buf[..wp])?;
1469    }
1470    Ok(())
1471}
1472
1473pub fn squeeze(
1474    squeeze_chars: &[u8],
1475    reader: &mut impl Read,
1476    writer: &mut impl Write,
1477) -> io::Result<()> {
1478    if squeeze_chars.len() == 1 {
1479        return squeeze_single_stream(squeeze_chars[0], reader, writer);
1480    }
1481
1482    let member = build_member_set(squeeze_chars);
1483    let mut buf = vec![0u8; STREAM_BUF];
1484    let mut last_squeezed: u16 = 256;
1485
1486    loop {
1487        let n = read_full(reader, &mut buf)?;
1488        if n == 0 {
1489            break;
1490        }
1491        let mut wp = 0;
1492        unsafe {
1493            let ptr = buf.as_mut_ptr();
1494            for i in 0..n {
1495                let b = *ptr.add(i);
1496                if is_member(&member, b) {
1497                    if last_squeezed == b as u16 {
1498                        continue;
1499                    }
1500                    last_squeezed = b as u16;
1501                } else {
1502                    last_squeezed = 256;
1503                }
1504                *ptr.add(wp) = b;
1505                wp += 1;
1506            }
1507        }
1508        writer.write_all(&buf[..wp])?;
1509    }
1510    Ok(())
1511}
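
// Illustrative sketch of the general (multi-byte) squeeze path: with two or more
// squeeze bytes the bitset loop runs; a single byte would dispatch to
// squeeze_single_stream instead. Example values only.
#[cfg(test)]
#[test]
fn squeeze_collapses_runs_of_member_bytes() {
    let mut input: &[u8] = b"aaabbbxccc";
    let mut out = Vec::new();
    // Runs of 'a' and 'b' collapse to one byte each; 'c' is not in the set and passes through.
    squeeze(b"ab", &mut input, &mut out).unwrap();
    assert_eq!(out, b"abxccc");
}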
1512
1513fn squeeze_single_stream(
1514    ch: u8,
1515    reader: &mut impl Read,
1516    writer: &mut impl Write,
1517) -> io::Result<()> {
1518    // Use a two-byte pattern finder (ch,ch) to jump directly to squeeze points.
1519    // For squeeze-spaces: most of the data has no consecutive spaces, so memmem
1520    // skips over huge regions at SIMD speed. When a pair is found, we scan the
1521    // run length and collapse it to one occurrence.
1522    let pair = [ch, ch];
1523    let finder = memchr::memmem::Finder::new(&pair);
1524    let mut buf = vec![0u8; STREAM_BUF];
1525    let mut was_squeeze_char = false;
1526
1527    loop {
1528        let n = read_full(reader, &mut buf)?;
1529        if n == 0 {
1530            break;
1531        }
1532
1533        let ptr = buf.as_mut_ptr();
1534        let mut wp = 0;
1535        let mut i = 0;
1536
1537        // Handle carry-over: if previous chunk ended with squeeze char,
1538        // skip leading occurrences of that char in this chunk.
1539        if was_squeeze_char {
1540            while i < n && unsafe { *ptr.add(i) } == ch {
1541                i += 1;
1542            }
1543            if i >= n {
1544                // Entire chunk is squeeze-char continuation
1545                // was_squeeze_char remains true
1546                continue;
1547            }
1548        }
1549
1550        // Use memmem to find consecutive pairs (ch, ch) — SIMD-accelerated.
1551        // Between pairs, the data passes through unchanged.
1552        loop {
1553            match finder.find(&buf[i..n]) {
1554                Some(offset) => {
1555                    // Copy everything up to and including the first char of the pair
1556                    let copy_len = offset + 1; // include first ch
1557                    if copy_len > 0 && wp != i {
1558                        unsafe {
1559                            std::ptr::copy(ptr.add(i), ptr.add(wp), copy_len);
1560                        }
1561                    }
1562                    wp += copy_len;
1563                    i += offset + 1; // position after first ch of pair
1564                    // Skip all remaining consecutive ch bytes (the run)
1565                    while i < n && unsafe { *ptr.add(i) } == ch {
1566                        i += 1;
1567                    }
1568                    if i >= n {
1569                        was_squeeze_char = true;
1570                        break;
1571                    }
1572                }
1573                None => {
1574                    // No more consecutive pairs — copy remainder
1575                    let run_len = n - i;
1576                    if run_len > 0 && wp != i {
1577                        unsafe {
1578                            std::ptr::copy(ptr.add(i), ptr.add(wp), run_len);
1579                        }
1580                    }
1581                    wp += run_len;
1582                    // Check if chunk ends with the squeeze char
1583                    was_squeeze_char = n > 0 && unsafe { *ptr.add(n - 1) } == ch;
1584                    break;
1585                }
1586            }
1587        }
1588
1589        writer.write_all(&buf[..wp])?;
1590    }
1591    Ok(())
1592}
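
// Illustrative sketch of the single-byte fast path: the (ch, ch) pair finder only
// stops where consecutive duplicates exist, so isolated occurrences pass through
// untouched and runs (including a trailing run) collapse to one byte. Example input only.
#[cfg(test)]
#[test]
fn squeeze_single_stream_collapses_consecutive_duplicates() {
    let mut input: &[u8] = b"a  b.c   ";
    let mut out = Vec::new();
    squeeze_single_stream(b' ', &mut input, &mut out).unwrap();
    assert_eq!(out, b"a b.c ");
}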
1593
1594// ============================================================================
1595// Batch in-place functions (owned data from piped stdin)
1596// ============================================================================
1597
1598/// Translate bytes in-place on an owned buffer, then write.
1599/// For piped stdin where we own the data, this avoids the separate output buffer
1600/// allocation needed by translate_mmap. Uses parallel in-place SIMD for large data.
1601pub fn translate_owned(
1602    set1: &[u8],
1603    set2: &[u8],
1604    data: &mut [u8],
1605    writer: &mut impl Write,
1606) -> io::Result<()> {
1607    let table = build_translate_table(set1, set2);
1608
1609    // Identity table — pure passthrough
1610    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1611    if is_identity {
1612        return writer.write_all(data);
1613    }
1614
1615    // SIMD range fast path (in-place)
1616    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1617        if data.len() >= PARALLEL_THRESHOLD {
1618            let n_threads = rayon::current_num_threads().max(1);
1619            let chunk_size = (data.len() / n_threads).max(32 * 1024);
1620            data.par_chunks_mut(chunk_size).for_each(|chunk| {
1621                translate_range_simd_inplace(chunk, lo, hi, offset);
1622            });
1623        } else {
1624            translate_range_simd_inplace(data, lo, hi, offset);
1625        }
1626        return writer.write_all(data);
1627    }
1628
1629    // General table lookup (in-place)
1630    if data.len() >= PARALLEL_THRESHOLD {
1631        let n_threads = rayon::current_num_threads().max(1);
1632        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1633        data.par_chunks_mut(chunk_size).for_each(|chunk| {
1634            translate_inplace(chunk, &table);
1635        });
1636    } else {
1637        translate_inplace(data, &table);
1638    }
1639    writer.write_all(data)
1640}
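
// Illustrative sketch: translate_owned mutates the caller's buffer and then writes
// it, so both the writer output and the owned buffer end up translated. The sets
// below map a->x, b->y, c->z; values are examples only.
#[cfg(test)]
#[test]
fn translate_owned_translates_in_place_and_writes() {
    let mut data = b"aabbcc dd".to_vec();
    let mut out = Vec::new();
    translate_owned(b"abc", b"xyz", &mut data, &mut out).unwrap();
    assert_eq!(out, b"xxyyzz dd");
    // The owned buffer itself now holds the translated bytes as well.
    assert_eq!(data, b"xxyyzz dd");
}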
1641
1642// ============================================================================
1643// Mmap-based functions (zero-copy input from byte slice)
1644// ============================================================================
1645
1646/// Maximum data size for single-allocation translate approach.
1647/// Below this limit, translate ALL data into one buffer and do a single write_all.
1648/// Above this, use chunked approach to limit memory usage.
1649const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;
1650
1651/// Translate bytes from an mmap'd byte slice.
1652/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
1653/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
1654/// Falls back to scalar 256-byte table lookup for general translations.
1655///
1656/// For data >= 2MB: rayon parallel processing across multiple cores, then one write_all.
1657/// For smaller data: single allocation + single write_all (1 syscall).
1658/// (The chunked fallback for data above SINGLE_WRITE_LIMIT is unreachable in practice: such data always takes the parallel path.)
1659pub fn translate_mmap(
1660    set1: &[u8],
1661    set2: &[u8],
1662    data: &[u8],
1663    writer: &mut impl Write,
1664) -> io::Result<()> {
1665    let table = build_translate_table(set1, set2);
1666
1667    // Check if table is identity — pure passthrough
1668    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1669    if is_identity {
1670        return writer.write_all(data);
1671    }
1672
1673    // Try SIMD fast path for single-range constant-offset translations
1674    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1675        return translate_mmap_range(data, writer, lo, hi, offset);
1676    }
1677
1678    // General case: table lookup (with parallel processing for large data)
1679    translate_mmap_table(data, writer, &table)
1680}
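
// Illustrative sketch: the mmap-style entry point only needs a byte slice, so any
// in-memory slice works for a quick check. 'e'->'i' and 'l'->'p' is not a single
// contiguous range, so it is expected to take the general table-lookup path rather
// than the SIMD range path. Example values only.
#[cfg(test)]
#[test]
fn translate_mmap_applies_byte_table() {
    let mut out = Vec::new();
    translate_mmap(b"el", b"ip", b"hello", &mut out).unwrap();
    assert_eq!(out, b"hippo");
}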
1681
1682/// SIMD range translate for mmap data, with rayon parallel processing.
1683fn translate_mmap_range(
1684    data: &[u8],
1685    writer: &mut impl Write,
1686    lo: u8,
1687    hi: u8,
1688    offset: i8,
1689) -> io::Result<()> {
1690    // Parallel path: split data into chunks, translate each in parallel
1691    if data.len() >= PARALLEL_THRESHOLD {
1692        let mut buf = alloc_uninit_vec(data.len());
1693        let n_threads = rayon::current_num_threads().max(1);
1694        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1695
1696        // Process chunks in parallel: each thread writes to its slice of buf
1697        data.par_chunks(chunk_size)
1698            .zip(buf.par_chunks_mut(chunk_size))
1699            .for_each(|(src_chunk, dst_chunk)| {
1700                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
1701            });
1702
1703        return writer.write_all(&buf);
1704    }
1705
1706    // Small data: single-threaded SIMD
1707    if data.len() <= SINGLE_WRITE_LIMIT {
1708        let mut buf = alloc_uninit_vec(data.len());
1709        translate_range_simd(data, &mut buf, lo, hi, offset);
1710        return writer.write_all(&buf);
1711    }
1712    // Chunked fallback for large data (unreachable in practice: anything above SINGLE_WRITE_LIMIT already exceeds PARALLEL_THRESHOLD and took the parallel path)
1713    let mut buf = alloc_uninit_vec(BUF_SIZE);
1714    for chunk in data.chunks(BUF_SIZE) {
1715        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
1716        writer.write_all(&buf[..chunk.len()])?;
1717    }
1718    Ok(())
1719}
1720
1721/// General table-lookup translate for mmap data, with rayon parallel processing.
1722fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
1723    // Parallel path: split data into chunks, translate each in parallel
1724    if data.len() >= PARALLEL_THRESHOLD {
1725        let mut buf = alloc_uninit_vec(data.len());
1726        let n_threads = rayon::current_num_threads().max(1);
1727        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1728
1729        data.par_chunks(chunk_size)
1730            .zip(buf.par_chunks_mut(chunk_size))
1731            .for_each(|(src_chunk, dst_chunk)| {
1732                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
1733            });
1734
1735        return writer.write_all(&buf);
1736    }
1737
1738    // Small data: single-threaded
1739    if data.len() <= SINGLE_WRITE_LIMIT {
1740        let mut buf = alloc_uninit_vec(data.len());
1741        translate_to(data, &mut buf, table);
1742        return writer.write_all(&buf);
1743    }
1744    let mut buf = alloc_uninit_vec(BUF_SIZE);
1745    for chunk in data.chunks(BUF_SIZE) {
1746        translate_to(chunk, &mut buf[..chunk.len()], table);
1747        writer.write_all(&buf[..chunk.len()])?;
1748    }
1749    Ok(())
1750}
1751
1752/// Translate + squeeze from mmap'd byte slice.
1753///
1754/// For data >= 2MB: two-phase: parallel translate, then sequential in-place squeeze.
1755/// For smaller data: single-pass translate+squeeze into one buffer, one write syscall.
1756/// (The chunked fallback for data above SINGLE_WRITE_LIMIT is unreachable in practice, since such data already exceeds PARALLEL_THRESHOLD.)
1757pub fn translate_squeeze_mmap(
1758    set1: &[u8],
1759    set2: &[u8],
1760    data: &[u8],
1761    writer: &mut impl Write,
1762) -> io::Result<()> {
1763    let table = build_translate_table(set1, set2);
1764    let squeeze_set = build_member_set(set2);
1765
1766    // For large data: two-phase approach
1767    // Phase 1: parallel translate into buffer
1768    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
1769    //          (squeeze only removes bytes, never grows, so no second allocation needed)
1770    if data.len() >= PARALLEL_THRESHOLD {
1771        // Phase 1: parallel translate
1772        let mut translated = alloc_uninit_vec(data.len());
1773        let range_info = detect_range_offset(&table);
1774        let n_threads = rayon::current_num_threads().max(1);
1775        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1776
1777        if let Some((lo, hi, offset)) = range_info {
1778            data.par_chunks(chunk_size)
1779                .zip(translated.par_chunks_mut(chunk_size))
1780                .for_each(|(src_chunk, dst_chunk)| {
1781                    translate_range_simd(
1782                        src_chunk,
1783                        &mut dst_chunk[..src_chunk.len()],
1784                        lo,
1785                        hi,
1786                        offset,
1787                    );
1788                });
1789        } else {
1790            data.par_chunks(chunk_size)
1791                .zip(translated.par_chunks_mut(chunk_size))
1792                .for_each(|(src_chunk, dst_chunk)| {
1793                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
1794                });
1795        }
1796
1797        // Phase 2: squeeze in-place on the translated buffer.
1798        // Since squeeze only removes bytes (never grows), we can read ahead and
1799        // compact into the same buffer, saving a full data.len() heap allocation.
1800        let mut last_squeezed: u16 = 256;
1801        let len = translated.len();
1802        let mut wp = 0;
1803        unsafe {
1804            let ptr = translated.as_mut_ptr();
1805            let mut i = 0;
1806            while i < len {
1807                let b = *ptr.add(i);
1808                if is_member(&squeeze_set, b) {
1809                    if last_squeezed == b as u16 {
1810                        i += 1;
1811                        continue;
1812                    }
1813                    last_squeezed = b as u16;
1814                } else {
1815                    last_squeezed = 256;
1816                }
1817                *ptr.add(wp) = b;
1818                wp += 1;
1819                i += 1;
1820            }
1821        }
1822        return writer.write_all(&translated[..wp]);
1823    }
1824
1825    // Single-write fast path: translate+squeeze all data in one pass, one write
1826    if data.len() <= SINGLE_WRITE_LIMIT {
1827        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
1828        let mut last_squeezed: u16 = 256;
1829        unsafe {
1830            buf.set_len(data.len());
1831            let outp: *mut u8 = buf.as_mut_ptr();
1832            let inp = data.as_ptr();
1833            let len = data.len();
1834            let mut wp = 0;
1835            let mut i = 0;
1836            while i < len {
1837                let translated = *table.get_unchecked(*inp.add(i) as usize);
1838                if is_member(&squeeze_set, translated) {
1839                    if last_squeezed == translated as u16 {
1840                        i += 1;
1841                        continue;
1842                    }
1843                    last_squeezed = translated as u16;
1844                } else {
1845                    last_squeezed = 256;
1846                }
1847                *outp.add(wp) = translated;
1848                wp += 1;
1849                i += 1;
1850            }
1851            buf.set_len(wp);
1852        }
1853        return writer.write_all(&buf);
1854    }
1855
1856    // Chunked path for large data
1857    let buf_size = data.len().min(BUF_SIZE);
1858    let mut buf = vec![0u8; buf_size];
1859    let mut last_squeezed: u16 = 256;
1860
1861    for chunk in data.chunks(buf_size) {
1862        translate_to(chunk, &mut buf[..chunk.len()], &table);
1863        let mut wp = 0;
1864        unsafe {
1865            let ptr = buf.as_mut_ptr();
1866            for i in 0..chunk.len() {
1867                let b = *ptr.add(i);
1868                if is_member(&squeeze_set, b) {
1869                    if last_squeezed == b as u16 {
1870                        continue;
1871                    }
1872                    last_squeezed = b as u16;
1873                } else {
1874                    last_squeezed = 256;
1875                }
1876                *ptr.add(wp) = b;
1877                wp += 1;
1878            }
1879        }
1880        writer.write_all(&buf[..wp])?;
1881    }
1882    Ok(())
1883}
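
// Illustrative sketch: translation runs first ('l' -> 's'), then squeezing uses the
// members of SET2, matching the tr -s behavior described above where the squeeze set
// is the translated set. Example bytes only; small inputs take the single-pass path.
#[cfg(test)]
#[test]
fn translate_squeeze_mmap_squeezes_translated_output() {
    let mut out = Vec::new();
    translate_squeeze_mmap(b"l", b"s", b"hello world", &mut out).unwrap();
    assert_eq!(out, b"heso worsd");
}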
1884
1885/// Delete from mmap'd byte slice.
1886///
1887/// For data >= 2MB: uses rayon parallel processing across multiple cores.
1888/// For smaller data: delete into one buffer, one write syscall.
1889/// (Data above SINGLE_WRITE_LIMIT always exceeds PARALLEL_THRESHOLD, so the chunked fallback below only serves as a safety net.)
1890pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
1891    if delete_chars.len() == 1 {
1892        return delete_single_char_mmap(delete_chars[0], data, writer);
1893    }
1894    if delete_chars.len() == 2 || delete_chars.len() == 3 {
1895        return delete_multi_memchr_mmap(delete_chars, data, writer);
1896    }
1897
1898    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
1899    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1900        return delete_range_mmap(data, writer, lo, hi);
1901    }
1902
1903    let member = build_member_set(delete_chars);
1904
1905    // Parallel path: pre-allocate a single output buffer of data.len() and have each
1906    // thread write to its non-overlapping slice, then do a single write_all.
1907    // This avoids per-chunk Vec allocations that the old approach had.
1908    if data.len() >= PARALLEL_THRESHOLD {
1909        let n_threads = rayon::current_num_threads().max(1);
1910        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1911
1912        // Each thread deletes into its slice of outbuf and returns bytes written.
1913        let mut outbuf = alloc_uninit_vec(data.len());
1914        let chunk_lens: Vec<usize> = data
1915            .par_chunks(chunk_size)
1916            .zip(outbuf.par_chunks_mut(chunk_size))
1917            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
1918            .collect();
1919
1920        // Compact: move each chunk's output to be contiguous.
1921        // chunk_lens[i] is how many bytes thread i wrote into its slice.
1922        // We shift them together because each dst_chunk begins at a multiple of chunk_size.
1923        let mut write_pos = 0;
1924        let mut src_offset = 0;
1925        for &clen in &chunk_lens {
1926            if clen > 0 && src_offset != write_pos {
1927                unsafe {
1928                    std::ptr::copy(
1929                        outbuf.as_ptr().add(src_offset),
1930                        outbuf.as_mut_ptr().add(write_pos),
1931                        clen,
1932                    );
1933                }
1934            }
1935            write_pos += clen;
1936            src_offset += chunk_size;
1937        }
1938
1939        return writer.write_all(&outbuf[..write_pos]);
1940    }
1941
1942    // Single-write fast path: delete into one buffer, one write
1943    if data.len() <= SINGLE_WRITE_LIMIT {
1944        let mut outbuf = alloc_uninit_vec(data.len());
1945        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
1946        return writer.write_all(&outbuf[..out_pos]);
1947    }
1948
1949    // Chunked path for large data
1950    let buf_size = data.len().min(BUF_SIZE);
1951    let mut outbuf = alloc_uninit_vec(buf_size);
1952
1953    for chunk in data.chunks(buf_size) {
1954        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
1955        writer.write_all(&outbuf[..out_pos])?;
1956    }
1957    Ok(())
1958}
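
// Illustrative sketch of the general bitset path: five non-contiguous delete bytes
// are expected to skip the memchr and range fast paths above. Example values only.
#[cfg(test)]
#[test]
fn delete_mmap_removes_member_bytes() {
    let mut out = Vec::new();
    delete_mmap(b"aeiou", b"programming is fun", &mut out).unwrap();
    assert_eq!(out, b"prgrmmng s fn");
}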
1959
1960/// SIMD range delete for mmap data, with rayon parallel processing.
1961fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
1962    // Parallel path: each thread deletes from its chunk into a local Vec
1963    if data.len() >= PARALLEL_THRESHOLD {
1964        let n_threads = rayon::current_num_threads().max(1);
1965        let chunk_size = (data.len() / n_threads).max(32 * 1024);
1966
1967        let results: Vec<Vec<u8>> = data
1968            .par_chunks(chunk_size)
1969            .map(|chunk| {
1970                let mut out = alloc_uninit_vec(chunk.len());
1971                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
1972                unsafe { out.set_len(wp) };
1973                out
1974            })
1975            .collect();
1976
1977        let slices: Vec<std::io::IoSlice> = results
1978            .iter()
1979            .filter(|r| !r.is_empty())
1980            .map(|r| std::io::IoSlice::new(r))
1981            .collect();
1982        return write_ioslices(writer, &slices);
1983    }
1984
1985    // Single-write fast path
1986    if data.len() <= SINGLE_WRITE_LIMIT {
1987        let mut outbuf = alloc_uninit_vec(data.len());
1988        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
1989        return writer.write_all(&outbuf[..wp]);
1990    }
1991
1992    // Chunked path
1993    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
1994    for chunk in data.chunks(BUF_SIZE) {
1995        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
1996        writer.write_all(&outbuf[..wp])?;
1997    }
1998    Ok(())
1999}
2000
2001/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
2002/// Returns number of bytes written.
2003#[inline]
2004fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
2005    let len = chunk.len();
2006    let mut out_pos = 0;
2007    let mut i = 0;
2008
2009    while i + 8 <= len {
2010        unsafe {
2011            let b0 = *chunk.get_unchecked(i);
2012            let b1 = *chunk.get_unchecked(i + 1);
2013            let b2 = *chunk.get_unchecked(i + 2);
2014            let b3 = *chunk.get_unchecked(i + 3);
2015            let b4 = *chunk.get_unchecked(i + 4);
2016            let b5 = *chunk.get_unchecked(i + 5);
2017            let b6 = *chunk.get_unchecked(i + 6);
2018            let b7 = *chunk.get_unchecked(i + 7);
2019
2020            *outbuf.get_unchecked_mut(out_pos) = b0;
2021            out_pos += !is_member(member, b0) as usize;
2022            *outbuf.get_unchecked_mut(out_pos) = b1;
2023            out_pos += !is_member(member, b1) as usize;
2024            *outbuf.get_unchecked_mut(out_pos) = b2;
2025            out_pos += !is_member(member, b2) as usize;
2026            *outbuf.get_unchecked_mut(out_pos) = b3;
2027            out_pos += !is_member(member, b3) as usize;
2028            *outbuf.get_unchecked_mut(out_pos) = b4;
2029            out_pos += !is_member(member, b4) as usize;
2030            *outbuf.get_unchecked_mut(out_pos) = b5;
2031            out_pos += !is_member(member, b5) as usize;
2032            *outbuf.get_unchecked_mut(out_pos) = b6;
2033            out_pos += !is_member(member, b6) as usize;
2034            *outbuf.get_unchecked_mut(out_pos) = b7;
2035            out_pos += !is_member(member, b7) as usize;
2036        }
2037        i += 8;
2038    }
2039
2040    while i < len {
2041        unsafe {
2042            let b = *chunk.get_unchecked(i);
2043            *outbuf.get_unchecked_mut(out_pos) = b;
2044            out_pos += !is_member(member, b) as usize;
2045        }
2046        i += 1;
2047    }
2048
2049    out_pos
2050}
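
// Illustrative sketch of the branchless kernel: every input byte is written
// speculatively and the write position only advances for non-members, so callers
// size `outbuf` at least as large as `chunk`. Example values only.
#[cfg(test)]
#[test]
fn delete_chunk_bitset_into_compacts_kept_bytes() {
    let member = build_member_set(b"-_");
    let chunk = b"a-b_c-d";
    let mut outbuf = vec![0u8; chunk.len()];
    let written = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
    assert_eq!(&outbuf[..written], b"abcd");
}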
2051
2052fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2053    // Zero-copy delete using writev: build IoSlice entries pointing to the
2054    // gaps between deleted characters in the ORIGINAL data buffer.
2055    // For `tr -d '\n'` on 10MB with ~200K newlines:
2056    //   - Old: 10MB allocation + 10MB copy into output buffer
2057    //   - New: ~200K * 16 = 3.2MB IoSlice entries, zero data copy
2058    // Uses SIMD memchr_iter to find all positions, then builds IoSlice spans.
2059    let mut iov: Vec<std::io::IoSlice> = Vec::new();
2060    let mut last = 0;
2061    for pos in memchr::memchr_iter(ch, data) {
2062        if pos > last {
2063            iov.push(std::io::IoSlice::new(&data[last..pos]));
2064        }
2065        last = pos + 1;
2066    }
2067    if last < data.len() {
2068        iov.push(std::io::IoSlice::new(&data[last..]));
2069    }
2070    write_ioslices(writer, &iov)
2071}
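
// Illustrative sketch of the zero-copy writev path: the IoSlice entries point into
// the original slice, and a Vec<u8> writer simply receives them in order.
// Example values only.
#[cfg(test)]
#[test]
fn delete_single_char_mmap_drops_every_occurrence() {
    let mut out = Vec::new();
    delete_single_char_mmap(b'\n', b"a\nb\n\nc\n", &mut out).unwrap();
    assert_eq!(out, b"abc");
}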
2072
2073fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2074    let c0 = chars[0];
2075    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
2076    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
2077    let is_three = chars.len() >= 3;
2078
2079    // Zero-copy delete using writev: build IoSlice entries pointing to the
2080    // gaps between deleted characters in the original data buffer.
2081    let mut iov: Vec<std::io::IoSlice> = Vec::new();
2082    let mut last = 0;
2083    if is_three {
2084        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
2085            if pos > last {
2086                iov.push(std::io::IoSlice::new(&data[last..pos]));
2087            }
2088            last = pos + 1;
2089        }
2090    } else {
2091        for pos in memchr::memchr2_iter(c0, c1, data) {
2092            if pos > last {
2093                iov.push(std::io::IoSlice::new(&data[last..pos]));
2094            }
2095            last = pos + 1;
2096        }
2097    }
2098    if last < data.len() {
2099        iov.push(std::io::IoSlice::new(&data[last..]));
2100    }
2101    write_ioslices(writer, &iov)
2102}
2103
2104/// Delete + squeeze from mmap'd byte slice.
2105///
2106/// For data <= 16MB: delete+squeeze into one buffer, one write syscall.
2107/// For data > 16MB: chunked approach to limit memory.
2108pub fn delete_squeeze_mmap(
2109    delete_chars: &[u8],
2110    squeeze_chars: &[u8],
2111    data: &[u8],
2112    writer: &mut impl Write,
2113) -> io::Result<()> {
2114    let delete_set = build_member_set(delete_chars);
2115    let squeeze_set = build_member_set(squeeze_chars);
2116
2117    // Single-write fast path: delete+squeeze all data in one pass, one write
2118    if data.len() <= SINGLE_WRITE_LIMIT {
2119        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
2120        let mut last_squeezed: u16 = 256;
2121        unsafe {
2122            outbuf.set_len(data.len());
2123            let outp: *mut u8 = outbuf.as_mut_ptr();
2124            let inp = data.as_ptr();
2125            let len = data.len();
2126            let mut out_pos = 0;
2127            let mut i = 0;
2128            while i < len {
2129                let b = *inp.add(i);
2130                if is_member(&delete_set, b) {
2131                    i += 1;
2132                    continue;
2133                }
2134                if is_member(&squeeze_set, b) {
2135                    if last_squeezed == b as u16 {
2136                        i += 1;
2137                        continue;
2138                    }
2139                    last_squeezed = b as u16;
2140                } else {
2141                    last_squeezed = 256;
2142                }
2143                *outp.add(out_pos) = b;
2144                out_pos += 1;
2145                i += 1;
2146            }
2147            outbuf.set_len(out_pos);
2148        }
2149        return writer.write_all(&outbuf);
2150    }
2151
2152    // Chunked path for large data
2153    let buf_size = data.len().min(BUF_SIZE);
2154    let mut outbuf = vec![0u8; buf_size];
2155    let mut last_squeezed: u16 = 256;
2156
2157    for chunk in data.chunks(buf_size) {
2158        let mut out_pos = 0;
2159        for &b in chunk {
2160            if is_member(&delete_set, b) {
2161                continue;
2162            }
2163            if is_member(&squeeze_set, b) {
2164                if last_squeezed == b as u16 {
2165                    continue;
2166                }
2167                last_squeezed = b as u16;
2168            } else {
2169                last_squeezed = 256;
2170            }
2171            unsafe {
2172                *outbuf.get_unchecked_mut(out_pos) = b;
2173            }
2174            out_pos += 1;
2175        }
2176        writer.write_all(&outbuf[..out_pos])?;
2177    }
2178    Ok(())
2179}
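
// Illustrative sketch: deletion is applied first, so runs that only become adjacent
// once the deleted bytes are gone still get squeezed. Example values only.
#[cfg(test)]
#[test]
fn delete_squeeze_mmap_squeezes_runs_created_by_deletion() {
    let mut out = Vec::new();
    delete_squeeze_mmap(b"-", b"o", b"f-oo--oo-bar", &mut out).unwrap();
    assert_eq!(out, b"fobar");
}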
2180
2181/// Squeeze from mmap'd byte slice.
2182///
2183/// For data >= 2MB: uses rayon parallel processing with chunk-boundary fixup.
2184/// For smaller data: squeeze into one buffer, one write syscall.
2185/// (As above, the chunked fallback is only a safety net; data that large takes the parallel path.)
2186pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2187    if squeeze_chars.len() == 1 {
2188        return squeeze_single_mmap(squeeze_chars[0], data, writer);
2189    }
2190    if squeeze_chars.len() == 2 {
2191        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
2192    }
2193    if squeeze_chars.len() == 3 {
2194        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
2195    }
2196
2197    let member = build_member_set(squeeze_chars);
2198
2199    // Parallel path: squeeze each chunk independently, then fix boundaries
2200    if data.len() >= PARALLEL_THRESHOLD {
2201        let n_threads = rayon::current_num_threads().max(1);
2202        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2203
2204        let results: Vec<Vec<u8>> = data
2205            .par_chunks(chunk_size)
2206            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2207            .collect();
2208
2209        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
2210        // and chunk N+1 starts with same byte B, and B is in squeeze set,
2211        // skip the first byte(s) of chunk N+1 that equal B.
2212        // Collect slices for writev to minimize syscalls.
2213        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2214        for (idx, result) in results.iter().enumerate() {
2215            if result.is_empty() {
2216                continue;
2217            }
2218            if idx > 0 {
2219                // Check boundary: does previous chunk end with same squeezable byte?
2220                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2221                    if is_member(&member, prev_last) {
2222                        // Skip leading bytes in this chunk that equal prev_last
2223                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2224                        if skip < result.len() {
2225                            slices.push(std::io::IoSlice::new(&result[skip..]));
2226                        }
2227                        continue;
2228                    }
2229                }
2230            }
2231            slices.push(std::io::IoSlice::new(result));
2232        }
2233        return write_ioslices(writer, &slices);
2234    }
2235
2236    // Single-write fast path: squeeze all data into one buffer, one write
2237    if data.len() <= SINGLE_WRITE_LIMIT {
2238        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
2239        let mut last_squeezed: u16 = 256;
2240        let len = data.len();
2241        let mut wp = 0;
2242        let mut i = 0;
2243
2244        unsafe {
2245            outbuf.set_len(data.len());
2246            let inp = data.as_ptr();
2247            let outp: *mut u8 = outbuf.as_mut_ptr();
2248
2249            while i < len {
2250                let b = *inp.add(i);
2251                if is_member(&member, b) {
2252                    if last_squeezed != b as u16 {
2253                        *outp.add(wp) = b;
2254                        wp += 1;
2255                        last_squeezed = b as u16;
2256                    }
2257                    i += 1;
2258                    while i < len && *inp.add(i) == b {
2259                        i += 1;
2260                    }
2261                } else {
2262                    last_squeezed = 256;
2263                    *outp.add(wp) = b;
2264                    wp += 1;
2265                    i += 1;
2266                }
2267            }
2268            outbuf.set_len(wp);
2269        }
2270        return writer.write_all(&outbuf);
2271    }
2272
2273    // Chunked path for large data
2274    let buf_size = data.len().min(BUF_SIZE);
2275    let mut outbuf = vec![0u8; buf_size];
2276    let mut last_squeezed: u16 = 256;
2277
2278    for chunk in data.chunks(buf_size) {
2279        let len = chunk.len();
2280        let mut wp = 0;
2281        let mut i = 0;
2282
2283        unsafe {
2284            let inp = chunk.as_ptr();
2285            let outp = outbuf.as_mut_ptr();
2286
2287            while i < len {
2288                let b = *inp.add(i);
2289                if is_member(&member, b) {
2290                    if last_squeezed != b as u16 {
2291                        *outp.add(wp) = b;
2292                        wp += 1;
2293                        last_squeezed = b as u16;
2294                    }
2295                    i += 1;
2296                    while i < len && *inp.add(i) == b {
2297                        i += 1;
2298                    }
2299                } else {
2300                    last_squeezed = 256;
2301                    *outp.add(wp) = b;
2302                    wp += 1;
2303                    i += 1;
2304                }
2305            }
2306        }
2307        writer.write_all(&outbuf[..wp])?;
2308    }
2309    Ok(())
2310}
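
// Illustrative sketch of the general bitset path: four squeeze bytes skip the
// one/two/three-byte fast paths above. Example values only.
#[cfg(test)]
#[test]
fn squeeze_mmap_general_path_collapses_runs() {
    let mut out = Vec::new();
    squeeze_mmap(b"abcd", b"aabbxccdd", &mut out).unwrap();
    assert_eq!(out, b"abxcd");
}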
2311
2312/// Squeeze a single chunk using bitset membership. Returns squeezed output.
2313fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
2314    let len = chunk.len();
2315    let mut out = Vec::with_capacity(len);
2316    let mut last_squeezed: u16 = 256;
2317    let mut i = 0;
2318
2319    unsafe {
2320        out.set_len(len);
2321        let inp = chunk.as_ptr();
2322        let outp: *mut u8 = out.as_mut_ptr();
2323        let mut wp = 0;
2324
2325        while i < len {
2326            let b = *inp.add(i);
2327            if is_member(member, b) {
2328                if last_squeezed != b as u16 {
2329                    *outp.add(wp) = b;
2330                    wp += 1;
2331                    last_squeezed = b as u16;
2332                }
2333                i += 1;
2334                while i < len && *inp.add(i) == b {
2335                    i += 1;
2336                }
2337            } else {
2338                last_squeezed = 256;
2339                *outp.add(wp) = b;
2340                wp += 1;
2341                i += 1;
2342            }
2343        }
2344        out.set_len(wp);
2345    }
2346    out
2347}
2348
2349fn squeeze_multi_mmap<const N: usize>(
2350    chars: &[u8],
2351    data: &[u8],
2352    writer: &mut impl Write,
2353) -> io::Result<()> {
2354    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
2355    if data.len() >= PARALLEL_THRESHOLD {
2356        let member = build_member_set(chars);
2357        let n_threads = rayon::current_num_threads().max(1);
2358        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2359
2360        let results: Vec<Vec<u8>> = data
2361            .par_chunks(chunk_size)
2362            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2363            .collect();
2364
2365        // Build IoSlice list, fixing boundaries
2366        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2367        for (idx, result) in results.iter().enumerate() {
2368            if result.is_empty() {
2369                continue;
2370            }
2371            if idx > 0 {
2372                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2373                    if is_member(&member, prev_last) {
2374                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
2375                        if skip < result.len() {
2376                            slices.push(std::io::IoSlice::new(&result[skip..]));
2377                        }
2378                        continue;
2379                    }
2380                }
2381            }
2382            slices.push(std::io::IoSlice::new(result));
2383        }
2384        return write_ioslices(writer, &slices);
2385    }
2386
2387    let buf_size = data.len().min(BUF_SIZE);
2388    let mut outbuf = vec![0u8; buf_size];
2389    let mut wp = 0;
2390    let mut last_squeezed: u16 = 256;
2391    let mut cursor = 0;
2392
2393    macro_rules! find_next {
2394        ($data:expr) => {
2395            if N == 2 {
2396                memchr::memchr2(chars[0], chars[1], $data)
2397            } else {
2398                memchr::memchr3(chars[0], chars[1], chars[2], $data)
2399            }
2400        };
2401    }
2402
2403    macro_rules! flush_and_copy {
2404        ($src:expr, $len:expr) => {
2405            if wp + $len > buf_size {
2406                writer.write_all(&outbuf[..wp])?;
2407                wp = 0;
2408            }
2409            if $len > buf_size {
2410                writer.write_all($src)?;
2411            } else {
2412                outbuf[wp..wp + $len].copy_from_slice($src);
2413                wp += $len;
2414            }
2415        };
2416    }
2417
2418    while cursor < data.len() {
2419        match find_next!(&data[cursor..]) {
2420            Some(offset) => {
2421                let pos = cursor + offset;
2422                let b = data[pos];
2423                if pos > cursor {
2424                    let span = pos - cursor;
2425                    flush_and_copy!(&data[cursor..pos], span);
2426                    last_squeezed = 256;
2427                }
2428                if last_squeezed != b as u16 {
2429                    if wp >= buf_size {
2430                        writer.write_all(&outbuf[..wp])?;
2431                        wp = 0;
2432                    }
2433                    outbuf[wp] = b;
2434                    wp += 1;
2435                    last_squeezed = b as u16;
2436                }
2437                let mut skip = pos + 1;
2438                while skip < data.len() && data[skip] == b {
2439                    skip += 1;
2440                }
2441                cursor = skip;
2442            }
2443            None => {
2444                let remaining = data.len() - cursor;
2445                flush_and_copy!(&data[cursor..], remaining);
2446                break;
2447            }
2448        }
2449    }
2450    if wp > 0 {
2451        writer.write_all(&outbuf[..wp])?;
2452    }
2453    Ok(())
2454}
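
// Illustrative sketch: squeeze_mmap dispatches two-byte sets here, where memchr2
// jumps between members and the gaps are copied through untouched. Example values only.
#[cfg(test)]
#[test]
fn squeeze_mmap_two_byte_fast_path() {
    let mut out = Vec::new();
    squeeze_mmap(b" \n", b"a  b\n\nc", &mut out).unwrap();
    assert_eq!(out, b"a b\nc");
}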
2455
2456fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
2457    if data.is_empty() {
2458        return Ok(());
2459    }
2460
2461    if memchr::memmem::find(data, &[ch, ch]).is_none() {
2462        return writer.write_all(data);
2463    }
2464
2465    // Parallel path: squeeze each chunk, fix boundaries
2466    if data.len() >= PARALLEL_THRESHOLD {
2467        let n_threads = rayon::current_num_threads().max(1);
2468        let chunk_size = (data.len() / n_threads).max(32 * 1024);
2469
2470        let results: Vec<Vec<u8>> = data
2471            .par_chunks(chunk_size)
2472            .map(|chunk| {
2473                let mut out = Vec::with_capacity(chunk.len());
2474                let mut cursor = 0;
2475                while cursor < chunk.len() {
2476                    match memchr::memchr(ch, &chunk[cursor..]) {
2477                        Some(offset) => {
2478                            let pos = cursor + offset;
2479                            if pos > cursor {
2480                                out.extend_from_slice(&chunk[cursor..pos]);
2481                            }
2482                            out.push(ch);
2483                            cursor = pos + 1;
2484                            while cursor < chunk.len() && chunk[cursor] == ch {
2485                                cursor += 1;
2486                            }
2487                        }
2488                        None => {
2489                            out.extend_from_slice(&chunk[cursor..]);
2490                            break;
2491                        }
2492                    }
2493                }
2494                out
2495            })
2496            .collect();
2497
2498        // Build IoSlice list, fixing squeezes that straddle chunk boundaries.
2499        // Use writev to minimize syscalls.
2500        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2501        for (idx, result) in results.iter().enumerate() {
2502            if result.is_empty() {
2503                continue;
2504            }
2505            if idx > 0 {
2506                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2507                    if prev_last == ch {
2508                        // Skip leading ch bytes in this chunk result
2509                        let skip = result.iter().take_while(|&&b| b == ch).count();
2510                        if skip < result.len() {
2511                            slices.push(std::io::IoSlice::new(&result[skip..]));
2512                        }
2513                        continue;
2514                    }
2515                }
2516            }
2517            slices.push(std::io::IoSlice::new(result));
2518        }
2519        return write_ioslices(writer, &slices);
2520    }
2521
2522    let buf_size = data.len().min(BUF_SIZE);
2523    let mut outbuf = vec![0u8; buf_size];
2524    let len = data.len();
2525    let mut wp = 0;
2526    let mut cursor = 0;
2527
2528    while cursor < len {
2529        match memchr::memchr(ch, &data[cursor..]) {
2530            Some(offset) => {
2531                let pos = cursor + offset;
2532                let gap = pos - cursor;
2533                if gap > 0 {
2534                    if wp + gap > buf_size {
2535                        writer.write_all(&outbuf[..wp])?;
2536                        wp = 0;
2537                    }
2538                    if gap > buf_size {
2539                        writer.write_all(&data[cursor..pos])?;
2540                    } else {
2541                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
2542                        wp += gap;
2543                    }
2544                }
2545                if wp >= buf_size {
2546                    writer.write_all(&outbuf[..wp])?;
2547                    wp = 0;
2548                }
2549                outbuf[wp] = ch;
2550                wp += 1;
2551                cursor = pos + 1;
2552                while cursor < len && data[cursor] == ch {
2553                    cursor += 1;
2554                }
2555            }
2556            None => {
2557                let remaining = len - cursor;
2558                if remaining > 0 {
2559                    if wp + remaining > buf_size {
2560                        writer.write_all(&outbuf[..wp])?;
2561                        wp = 0;
2562                    }
2563                    if remaining > buf_size {
2564                        writer.write_all(&data[cursor..])?;
2565                    } else {
2566                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
2567                        wp += remaining;
2568                    }
2569                }
2570                break;
2571            }
2572        }
2573    }
2574
2575    if wp > 0 {
2576        writer.write_all(&outbuf[..wp])?;
2577    }
2578    Ok(())
2579}
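
// Illustrative sketch: the early memmem check means data with no consecutive
// duplicates is written through untouched; otherwise each run of the squeeze byte
// collapses to a single occurrence. Example values only.
#[cfg(test)]
#[test]
fn squeeze_single_mmap_collapses_repeated_separators() {
    let mut out = Vec::new();
    squeeze_single_mmap(b'/', b"//usr///bin//", &mut out).unwrap();
    assert_eq!(out, b"/usr/bin/");
}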