coreutils_rs/tr/core.rs

use std::io::{self, Read, Write};

use rayon::prelude::*;

const BUF_SIZE: usize = 1024 * 1024; // 1MB — fits L2/L3 cache for locality

/// Stream buffer: 256KB — process data immediately after each read().
/// Stays in L2 cache, matches typical kernel pipe buffer size.
/// Unlike fill_buf (which loops to accumulate 8MB = ~128 syscalls for pipes),
/// we read once and process immediately, matching GNU tr's approach.
const STREAM_BUF: usize = 256 * 1024;

/// Minimum size for parallel translation.
/// Lowered to 4MB since rayon overhead is small (~10μs) compared to
/// translation time per chunk (~400μs at 10GB/s).
const PARALLEL_TRANSLATE_THRESHOLD: usize = 4 * 1024 * 1024;

/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}
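
// Illustrative sketch (not part of the original source): when set2 is shorter than
// set1, every extra set1 byte maps to set2's last byte, which is how GNU tr pads SET2.
// A tiny test makes the table semantics concrete.
#[cfg(test)]
mod translate_table_example {
    use super::build_translate_table;

    #[test]
    fn pads_with_last_byte_of_set2() {
        let table = build_translate_table(b"abc", b"xy");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'y');
        assert_eq!(table[b'c' as usize], b'y'); // padded with the last byte of set2
        assert_eq!(table[b'z' as usize], b'z'); // unmapped bytes stay identity
    }
}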

/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}
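
// Illustrative sketch (not in the original file): each byte value owns one bit in the
// 32-byte set, so membership is a single shift-and-mask with no branching or hashing.
#[cfg(test)]
mod member_set_example {
    use super::{build_member_set, is_member};

    #[test]
    fn bitset_membership() {
        let set = build_member_set(b" \t\n");
        assert!(is_member(&set, b' '));
        assert!(is_member(&set, b'\n'));
        assert!(!is_member(&set, b'a'));
    }
}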

// ============================================================================
// Table analysis for SIMD fast paths
// ============================================================================

/// Classification of a translation table for SIMD optimization.
#[allow(dead_code)] // Fields used only on x86_64 (AVX2 SIMD path)
enum TranslateKind {
    /// Table is identity — no translation needed.
    Identity,
    /// A contiguous range [lo, hi] with constant wrapping-add delta; identity elsewhere.
    RangeDelta { lo: u8, hi: u8, delta: u8 },
    /// Arbitrary translation — use general lookup table.
    General,
}

/// Analyze a translation table to detect SIMD-optimizable patterns.
fn analyze_table(table: &[u8; 256]) -> TranslateKind {
    let mut first_delta: Option<u8> = None;
    let mut lo: u8 = 0;
    let mut hi: u8 = 0;
    let mut count: u32 = 0;

    for i in 0..256u16 {
        let actual = table[i as usize];
        if actual != i as u8 {
            let delta = actual.wrapping_sub(i as u8);
            count += 1;
            match first_delta {
                None => {
                    first_delta = Some(delta);
                    lo = i as u8;
                    hi = i as u8;
                }
                Some(d) if d == delta => {
                    hi = i as u8;
                }
                _ => return TranslateKind::General,
            }
        }
    }

    match (count, first_delta) {
        (0, _) => TranslateKind::Identity,
        (c, Some(delta)) if c == (hi as u32 - lo as u32 + 1) => {
            TranslateKind::RangeDelta { lo, hi, delta }
        }
        _ => TranslateKind::General,
    }
}
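
// Illustrative sketch (not in the original file): the common `tr a-z A-Z` table is a
// single contiguous range with one wrapping delta ('A' - 'a' wraps to 224), so it is
// classified as RangeDelta and can take the AVX2 range path.
#[cfg(test)]
mod analyze_table_example {
    use super::{analyze_table, build_translate_table, TranslateKind};

    #[test]
    fn lowercase_to_uppercase_is_range_delta() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&set1, &set2);
        match analyze_table(&table) {
            TranslateKind::RangeDelta { lo, hi, delta } => {
                assert_eq!((lo, hi), (b'a', b'z'));
                assert_eq!(delta, b'A'.wrapping_sub(b'a')); // 224, i.e. -32 mod 256
            }
            _ => panic!("expected the RangeDelta classification"),
        }
    }
}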

// ============================================================================
// SIMD translation (x86_64 AVX2)
// ============================================================================

#[cfg(target_arch = "x86_64")]
mod simd_tr {
    /// Translate bytes in range [lo, hi] by adding `delta` (wrapping), leave others unchanged.
    /// Processes 128 bytes per iteration (4x unroll) using AVX2.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta(data: &[u8], out: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0usize;

            // 4x unrolled: process 128 bytes per iteration for better ILP
            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(inp.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(inp.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(outp.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(outp.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *inp.add(i);
                *outp.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }

    /// General 256-byte table lookup using 16-way vpshufb (nibble decomposition).
    /// Splits each input byte into high nibble (selects one of 16 shuffle tables)
    /// and low nibble (index within the shuffle table). Processes 64 bytes per
    /// iteration (2x unrolled) for instruction-level parallelism.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn general_lookup(data: &[u8], out: &mut [u8], table: &[u8; 256]) {
        unsafe {
            use std::arch::x86_64::*;

            // Build 16 vpshufb LUTs from the 256-byte translation table.
            // LUT[h] covers table[h*16..h*16+16], broadcast to both 128-bit lanes.
            let tp = table.as_ptr();
            let mut luts = [_mm256_setzero_si256(); 16];
            let mut h = 0;
            while h < 16 {
                luts[h] =
                    _mm256_broadcastsi128_si256(_mm_loadu_si128(tp.add(h * 16) as *const __m128i));
                h += 1;
            }

            let lo_mask = _mm256_set1_epi8(0x0F);
            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0;

            // 2x unrolled: process 64 bytes per iteration
            while i + 64 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);

                let lo0 = _mm256_and_si256(v0, lo_mask);
                let lo1 = _mm256_and_si256(v1, lo_mask);
                let hi0 = _mm256_and_si256(_mm256_srli_epi16(v0, 4), lo_mask);
                let hi1 = _mm256_and_si256(_mm256_srli_epi16(v1, 4), lo_mask);

                let mut r0 = _mm256_setzero_si256();
                let mut r1 = _mm256_setzero_si256();

                // Process all 16 high-nibble values.
                // For each h, bytes where high_nibble == h get their result from luts[h].
                macro_rules! do_nib {
                    ($h:literal) => {
                        let hv = _mm256_set1_epi8($h);
                        let lut = luts[$h as usize];
                        let m0 = _mm256_cmpeq_epi8(hi0, hv);
                        let m1 = _mm256_cmpeq_epi8(hi1, hv);
                        r0 = _mm256_or_si256(
                            r0,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo0), m0),
                        );
                        r1 = _mm256_or_si256(
                            r1,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo1), m1),
                        );
                    };
                }

                do_nib!(0);
                do_nib!(1);
                do_nib!(2);
                do_nib!(3);
                do_nib!(4);
                do_nib!(5);
                do_nib!(6);
                do_nib!(7);
                do_nib!(8);
                do_nib!(9);
                do_nib!(10);
                do_nib!(11);
                do_nib!(12);
                do_nib!(13);
                do_nib!(14);
                do_nib!(15);

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                i += 64;
            }

            // Single vector tail
            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let lo = _mm256_and_si256(v, lo_mask);
                let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

                let mut result = _mm256_setzero_si256();
                let mut hh = 0u8;
                while hh < 16 {
                    let hv = _mm256_set1_epi8(hh as i8);
                    let m = _mm256_cmpeq_epi8(hi, hv);
                    result = _mm256_or_si256(
                        result,
                        _mm256_and_si256(_mm256_shuffle_epi8(luts[hh as usize], lo), m),
                    );
                    hh += 1;
                }

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            // Scalar tail
            while i < len {
                *outp.add(i) = *table.get_unchecked(*inp.add(i) as usize);
                i += 1;
            }
        }
    }

    /// In-place general 256-byte table lookup using 16-way vpshufb.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn general_lookup_inplace(data: &mut [u8], table: &[u8; 256]) {
        unsafe {
            use std::arch::x86_64::*;

            let tp = table.as_ptr();
            let mut luts = [_mm256_setzero_si256(); 16];
            let mut h = 0;
            while h < 16 {
                luts[h] =
                    _mm256_broadcastsi128_si256(_mm_loadu_si128(tp.add(h * 16) as *const __m128i));
                h += 1;
            }

            let lo_mask = _mm256_set1_epi8(0x0F);
            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0;

            while i + 64 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);

                let lo0 = _mm256_and_si256(v0, lo_mask);
                let lo1 = _mm256_and_si256(v1, lo_mask);
                let hi0 = _mm256_and_si256(_mm256_srli_epi16(v0, 4), lo_mask);
                let hi1 = _mm256_and_si256(_mm256_srli_epi16(v1, 4), lo_mask);

                let mut r0 = _mm256_setzero_si256();
                let mut r1 = _mm256_setzero_si256();

                macro_rules! do_nib {
                    ($h:literal) => {
                        let hv = _mm256_set1_epi8($h);
                        let lut = luts[$h as usize];
                        let m0 = _mm256_cmpeq_epi8(hi0, hv);
                        let m1 = _mm256_cmpeq_epi8(hi1, hv);
                        r0 = _mm256_or_si256(
                            r0,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo0), m0),
                        );
                        r1 = _mm256_or_si256(
                            r1,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo1), m1),
                        );
                    };
                }

                do_nib!(0);
                do_nib!(1);
                do_nib!(2);
                do_nib!(3);
                do_nib!(4);
                do_nib!(5);
                do_nib!(6);
                do_nib!(7);
                do_nib!(8);
                do_nib!(9);
                do_nib!(10);
                do_nib!(11);
                do_nib!(12);
                do_nib!(13);
                do_nib!(14);
                do_nib!(15);

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                i += 64;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let lo = _mm256_and_si256(v, lo_mask);
                let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

                let mut result = _mm256_setzero_si256();
                let mut hh = 0u8;
                while hh < 16 {
                    let hv = _mm256_set1_epi8(hh as i8);
                    let m = _mm256_cmpeq_epi8(hi, hv);
                    result = _mm256_or_si256(
                        result,
                        _mm256_and_si256(_mm256_shuffle_epi8(luts[hh as usize], lo), m),
                    );
                    hh += 1;
                }

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = *table.get_unchecked(b as usize);
                i += 1;
            }
        }
    }

    /// In-place SIMD translate for stdin path.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta_inplace(data: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0usize;

            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(ptr.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(ptr.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(ptr.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(ptr.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }
}
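
// Hedged consistency sketch (not part of the original file): on hosts with AVX2, the
// nibble-decomposed vpshufb lookup should agree byte-for-byte with the scalar table
// walk, including the 32-byte and scalar tails exercised by a non-multiple length.
#[cfg(all(test, target_arch = "x86_64"))]
mod simd_tr_consistency_example {
    use super::*;

    #[test]
    fn general_lookup_matches_scalar_table_walk() {
        if !is_x86_feature_detected!("avx2") {
            return; // nothing to verify on this host
        }
        let table: [u8; 256] = std::array::from_fn(|i| (i as u8).wrapping_mul(31).wrapping_add(7));
        let data: Vec<u8> = (0..1000u32).map(|i| (i % 251) as u8).collect();
        let mut simd_out = vec![0u8; data.len()];
        let mut scalar_out = vec![0u8; data.len()];
        unsafe { simd_tr::general_lookup(&data, &mut simd_out, &table) };
        translate_chunk(&data, &mut scalar_out, &table);
        assert_eq!(simd_out, scalar_out);
    }
}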

/// Check if AVX2 is available at runtime.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn has_avx2() -> bool {
    is_x86_feature_detected!("avx2")
}

/// Translate a chunk of bytes using a lookup table — unrolled 8-byte inner loop.
#[inline(always)]
fn translate_chunk(chunk: &[u8], out: &mut [u8], table: &[u8; 256]) {
    let len = chunk.len();
    let mut i = 0;
    while i + 8 <= len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
            *out.get_unchecked_mut(i + 1) =
                *table.get_unchecked(*chunk.get_unchecked(i + 1) as usize);
            *out.get_unchecked_mut(i + 2) =
                *table.get_unchecked(*chunk.get_unchecked(i + 2) as usize);
            *out.get_unchecked_mut(i + 3) =
                *table.get_unchecked(*chunk.get_unchecked(i + 3) as usize);
            *out.get_unchecked_mut(i + 4) =
                *table.get_unchecked(*chunk.get_unchecked(i + 4) as usize);
            *out.get_unchecked_mut(i + 5) =
                *table.get_unchecked(*chunk.get_unchecked(i + 5) as usize);
            *out.get_unchecked_mut(i + 6) =
                *table.get_unchecked(*chunk.get_unchecked(i + 6) as usize);
            *out.get_unchecked_mut(i + 7) =
                *table.get_unchecked(*chunk.get_unchecked(i + 7) as usize);
        }
        i += 8;
    }
    while i < len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
        }
        i += 1;
    }
}

/// In-place translate for stdin path — avoids separate output buffer.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let tab = table.as_ptr();

    unsafe {
        let mut i = 0;
        while i + 8 <= len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *tab.add(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *tab.add(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *tab.add(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *tab.add(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *tab.add(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *tab.add(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *tab.add(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// Dispatch: choose SIMD or scalar based on table analysis
// ============================================================================

/// Translate a chunk using the best available method.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_chunk_dispatch(
    chunk: &[u8],
    out: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {
            out[..chunk.len()].copy_from_slice(chunk);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta(chunk, out, *lo, *hi, *delta) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_chunk(chunk, out, table);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::General => {
            if _use_simd {
                unsafe { simd_tr::general_lookup(chunk, out, table) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::General => {
            translate_chunk(chunk, out, table);
        }
    }
}

/// In-place translate dispatch.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_inplace_dispatch(
    data: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {}
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta_inplace(data, *lo, *hi, *delta) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_inplace(data, table);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::General => {
            if _use_simd {
                unsafe { simd_tr::general_lookup_inplace(data, table) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::General => {
            translate_inplace(data, table);
        }
    }
}

// ============================================================================
// Streaming functions (Read + Write)
// Process data immediately after each read() — no fill_buf accumulation.
// Most paths use a 256KB buffer (L2-friendly); translate uses a 1MB buffer
// to reduce read() syscalls.
// ============================================================================

pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Use 1MB buffer for fewer read() syscalls on pipes
    let mut buf = vec![0u8; BUF_SIZE];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        translate_inplace_dispatch(&mut buf[..n], &table, &kind, use_simd);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}
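
// Usage sketch (illustrative, not in the original file): any Read/Write pair works,
// so an in-memory round trip is enough to see the streaming translate path.
#[cfg(test)]
mod translate_usage_example {
    use super::translate;

    #[test]
    fn uppercase_in_memory() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut input: &[u8] = b"hello, tr!";
        let mut output: Vec<u8> = Vec::new();
        translate(&set1, &set2, &mut input, &mut output).unwrap();
        assert_eq!(output, b"HELLO, TR!".to_vec());
    }
}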

pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Single buffer: SIMD translate in-place, then squeeze in-place compaction.
    // Eliminates outbuf allocation and saves one full memcpy of data.
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // Phase 1: SIMD translate in-place (uses AVX2 when available)
        translate_inplace_dispatch(&mut buf[..n], &table, &kind, use_simd);
        // Phase 2: squeeze in-place compaction (wp <= i always, safe)
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single character delete using SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }

    // Fast paths: 2-3 char delete using SIMD memchr2/memchr3
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    let member = build_member_set(delete_chars);
    // Single buffer with in-place compaction — eliminates outbuf allocation + memcpy
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            // 8-byte unrolled in-place compaction
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                if !is_member(&member, b0) {
                    *ptr.add(wp) = b0;
                    wp += 1;
                }
                if !is_member(&member, b1) {
                    *ptr.add(wp) = b1;
                    wp += 1;
                }
                if !is_member(&member, b2) {
                    *ptr.add(wp) = b2;
                    wp += 1;
                }
                if !is_member(&member, b3) {
                    *ptr.add(wp) = b3;
                    wp += 1;
                }
                if !is_member(&member, b4) {
                    *ptr.add(wp) = b4;
                    wp += 1;
                }
                if !is_member(&member, b5) {
                    *ptr.add(wp) = b5;
                    wp += 1;
                }
                if !is_member(&member, b6) {
                    *ptr.add(wp) = b6;
                    wp += 1;
                }
                if !is_member(&member, b7) {
                    *ptr.add(wp) = b7;
                    wp += 1;
                }
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                if !is_member(&member, b) {
                    *ptr.add(wp) = b;
                    wp += 1;
                }
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
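
// Usage sketch (illustrative, not in the original file): a five-byte delete set goes
// through the general bitset path; sets of 1 to 3 bytes take the memchr fast paths above.
#[cfg(test)]
mod delete_usage_example {
    use super::delete;

    #[test]
    fn drop_vowels() {
        let mut input: &[u8] = b"translate and delete";
        let mut output: Vec<u8> = Vec::new();
        delete(b"aeiou", &mut input, &mut output).unwrap();
        assert_eq!(output, b"trnslt nd dlt".to_vec());
    }
}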

/// Single-character delete from a reader — in-place compaction with SIMD memchr.
/// Uses memchr for SIMD scanning + ptr::copy for in-place shift + single write_all.
fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1; // skip the deleted char
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Multi-character delete (2-3 chars) — in-place compaction with SIMD memchr2/memchr3.
fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &buf[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    // Single buffer with in-place compaction
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single squeeze char — bulk copy non-match runs
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    // Single buffer with in-place compaction — eliminates outbuf allocation + memcpy
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
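
// Usage sketch (illustrative, not in the original file): with a two-byte squeeze set the
// general path runs, collapsing each run of a member byte to a single occurrence while
// `last_squeezed` carries the run state across read() boundaries.
#[cfg(test)]
mod squeeze_usage_example {
    use super::squeeze;

    #[test]
    fn collapse_repeated_separators() {
        let mut input: &[u8] = b"a  b\n\n\nc";
        let mut output: Vec<u8> = Vec::new();
        squeeze(b" \n", &mut input, &mut output).unwrap();
        assert_eq!(output, b"a b\nc".to_vec());
    }
}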

/// Squeeze a single character from a stream — in-place compaction with SIMD memchr.
/// Single buffer: eliminates outbuf allocation and saves one full memcpy.
/// Uses memchr for fast SIMD scanning, ptr::copy for in-place shift.
fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };

        // In-place squeeze compaction using memchr for SIMD scanning.
        // wp tracks write position (always <= read position i), so in-place is safe.
        let mut wp = 0;
        let mut i = 0;

        while i < n {
            // Cross-chunk continuation: skip squeeze chars from previous chunk
            if was_squeeze_char && buf[i] == ch {
                i += 1;
                while i < n && buf[i] == ch {
                    i += 1;
                }
                // was_squeeze_char stays true until we see a non-squeeze char
                if i >= n {
                    break;
                }
            }

            // Find next occurrence of squeeze char using SIMD memchr
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    let run_len = offset;
                    // Shift non-squeeze run left in-place (skip if already in position)
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    i += run_len;

                    // Emit one squeeze char, skip consecutive duplicates
                    unsafe {
                        *buf.as_mut_ptr().add(wp) = ch;
                    }
                    wp += 1;
                    was_squeeze_char = true;
                    i += 1;
                    while i < n && buf[i] == ch {
                        i += 1;
                    }
                }
                None => {
                    // No more squeeze chars — shift remaining data
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    was_squeeze_char = false;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

// ============================================================================
// Mmap-based functions (zero-copy input from byte slice)
// ============================================================================

/// Translate bytes from an mmap'd byte slice — zero syscall reads.
/// Uses SIMD AVX2 for range-delta patterns (e.g., a-z → A-Z).
/// For large inputs, translates in parallel using rayon for maximum throughput.
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    if matches!(kind, TranslateKind::Identity) {
        return writer.write_all(data);
    }

    // Parallel translation for very large inputs — single shared output buffer
    if data.len() >= PARALLEL_TRANSLATE_THRESHOLD {
        let mut out = vec![0u8; data.len()];
        let num_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() + num_threads - 1) / num_threads;
        // Align to BUF_SIZE boundaries for cache efficiency
        let chunk_size = ((chunk_size + BUF_SIZE - 1) / BUF_SIZE) * BUF_SIZE;

        // Translate in parallel into shared buffer — no per-chunk allocation
        data.par_chunks(chunk_size)
            .zip(out.par_chunks_mut(chunk_size))
            .for_each(|(inp, outp)| {
                translate_chunk_dispatch(inp, &mut outp[..inp.len()], &table, &kind, use_simd);
            });

        // Single write of entire result
        writer.write_all(&out)?;
        return Ok(());
    }

    // Single-allocation path: translate entire data into one buffer, single write
    if data.len() <= BUF_SIZE {
        let mut out = vec![0u8; data.len()];
        translate_chunk_dispatch(data, &mut out, &table, &kind, use_simd);
        writer.write_all(&out)?;
        return Ok(());
    }

    // Chunked path for larger data — reuses buffer across chunks
    let mut out = vec![0u8; BUF_SIZE];
    for chunk in data.chunks(BUF_SIZE) {
        translate_chunk_dispatch(chunk, &mut out[..chunk.len()], &table, &kind, use_simd);
        writer.write_all(&out[..chunk.len()])?;
    }
    Ok(())
}
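
// Usage sketch (illustrative, not in the original file): the mmap-path functions only
// need a byte slice, so any in-memory buffer can stand in for a mapped file. A ROT13
// table has two distinct deltas, so it exercises the General (vpshufb or scalar) path.
#[cfg(test)]
mod translate_mmap_usage_example {
    use super::translate_mmap;

    #[test]
    fn rot13_from_a_slice() {
        let set1: Vec<u8> = (b'a'..=b'z').chain(b'A'..=b'Z').collect();
        let set2: Vec<u8> = (b'n'..=b'z')
            .chain(b'a'..=b'm')
            .chain(b'N'..=b'Z')
            .chain(b'A'..=b'M')
            .collect();
        let mut output: Vec<u8> = Vec::new();
        translate_mmap(&set1, &set2, b"Uryyb, jbeyq!", &mut output).unwrap();
        assert_eq!(output, b"Hello, world!".to_vec());
    }
}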

/// Translate + squeeze from mmap'd byte slice.
/// Single buffer: translate into buffer, then squeeze in-place (wp <= i always holds).
/// Eliminates second buffer allocation and reduces memory traffic.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Single buffer: translate chunk→buf, then squeeze in-place within buf
    let mut buf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        // Phase 1: Translate into buf (may use SIMD)
        translate_chunk_dispatch(chunk, &mut buf[..chunk.len()], &table, &kind, use_simd);

        // Phase 2: Squeeze in-place (wp <= i always, safe for overlapping writes)
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Delete from mmap'd byte slice.
/// Uses SIMD memchr for single-character delete (common case).
/// For multi-char delete, uses 8-byte unrolled scan with bitset lookup.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single character delete uses SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }

    // Fast path: 2-char delete uses SIMD memchr2 (bulk copy between matches)
    if delete_chars.len() == 2 {
        return delete_multi_memchr_mmap::<2>(delete_chars, data, writer);
    }

    // Fast path: 3-char delete uses SIMD memchr3 (bulk copy between matches)
    if delete_chars.len() == 3 {
        return delete_multi_memchr_mmap::<3>(delete_chars, data, writer);
    }

    let member = build_member_set(delete_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        let len = chunk.len();
        let mut i = 0;

        // 8-byte unrolled scan for better ILP
        while i + 8 <= len {
            unsafe {
                let b0 = *chunk.get_unchecked(i);
                let b1 = *chunk.get_unchecked(i + 1);
                let b2 = *chunk.get_unchecked(i + 2);
                let b3 = *chunk.get_unchecked(i + 3);
                let b4 = *chunk.get_unchecked(i + 4);
                let b5 = *chunk.get_unchecked(i + 5);
                let b6 = *chunk.get_unchecked(i + 6);
                let b7 = *chunk.get_unchecked(i + 7);

                if !is_member(&member, b0) {
                    *outbuf.get_unchecked_mut(out_pos) = b0;
                    out_pos += 1;
                }
                if !is_member(&member, b1) {
                    *outbuf.get_unchecked_mut(out_pos) = b1;
                    out_pos += 1;
                }
                if !is_member(&member, b2) {
                    *outbuf.get_unchecked_mut(out_pos) = b2;
                    out_pos += 1;
                }
                if !is_member(&member, b3) {
                    *outbuf.get_unchecked_mut(out_pos) = b3;
                    out_pos += 1;
                }
                if !is_member(&member, b4) {
                    *outbuf.get_unchecked_mut(out_pos) = b4;
                    out_pos += 1;
                }
                if !is_member(&member, b5) {
                    *outbuf.get_unchecked_mut(out_pos) = b5;
                    out_pos += 1;
                }
                if !is_member(&member, b6) {
                    *outbuf.get_unchecked_mut(out_pos) = b6;
                    out_pos += 1;
                }
                if !is_member(&member, b7) {
                    *outbuf.get_unchecked_mut(out_pos) = b7;
                    out_pos += 1;
                }
            }
            i += 8;
        }

        while i < len {
            unsafe {
                let b = *chunk.get_unchecked(i);
                if !is_member(&member, b) {
                    *outbuf.get_unchecked_mut(out_pos) = b;
                    out_pos += 1;
                }
            }
            i += 1;
        }

        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Multi-character delete (2-3 chars) using SIMD memchr2/memchr3.
/// Chunked: processes 1MB at a time into contiguous output buffer, single write_all per chunk.
/// Eliminates millions of small BufWriter write_all calls.
fn delete_multi_memchr_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut outbuf = vec![0u8; BUF_SIZE];

    for chunk in data.chunks(BUF_SIZE) {
        let mut wp = 0;
        let mut last = 0;

        macro_rules! process_iter {
            ($iter:expr) => {
                for pos in $iter {
                    if pos > last {
                        let run = pos - last;
                        outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                        wp += run;
                    }
                    last = pos + 1;
                }
            };
        }

        if N == 2 {
            process_iter!(memchr::memchr2_iter(chars[0], chars[1], chunk));
        } else {
            process_iter!(memchr::memchr3_iter(chars[0], chars[1], chars[2], chunk));
        }

        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Single-character delete from mmap using SIMD memchr.
/// Chunked: processes 1MB at a time into contiguous output buffer, single write_all per chunk.
/// Uses memchr_iter (precomputed SIMD state for entire chunk) + bulk copy_from_slice.
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut outbuf = vec![0u8; BUF_SIZE];

    for chunk in data.chunks(BUF_SIZE) {
        let mut wp = 0;
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                let run = pos - last;
                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                wp += run;
            }
            last = pos + 1;
        }
        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Delete + squeeze from mmap'd byte slice.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Squeeze from mmap'd byte slice.
/// Uses a two-pass approach: find runs of squeezable bytes with memchr,
/// then copy non-squeezed content in bulk.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single squeeze character — use SIMD memchr to find runs
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }

    // Fast path: 2-3 squeeze chars — use memchr2/memchr3 for SIMD scanning
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    // General path: chunked output buffer with member check
    let member = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    // Skip consecutive duplicates
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Squeeze with 2-3 char sets using SIMD memchr2/memchr3 for fast scanning.
/// Batched: copies into output buffer, single write_all per buffer fill.
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut wp = 0;
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > BUF_SIZE {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > BUF_SIZE {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                // Copy non-member span + first squeeze char to output buffer
                if pos > cursor {
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    if wp >= BUF_SIZE {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                // Skip consecutive duplicates of same byte
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Squeeze a single repeated character from mmap'd data.
/// Batched: copies non-duplicate runs into output buffer, single write_all per buffer fill.
/// Uses SIMD memchr for bulk-skip between occurrences of the squeeze char.
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut wp = 0;
    let mut cursor = 0;

    while cursor < data.len() {
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let run = offset + 1; // include first squeeze char

                // Flush if output buffer can't hold the run
                if wp + run > BUF_SIZE {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }

                if run > BUF_SIZE {
                    // Run is larger than the buffer: write it (including the first
                    // squeeze char) directly, matching the tail path below.
                    writer.write_all(&data[cursor..pos + 1])?;
                } else {
                    outbuf[wp..wp + run].copy_from_slice(&data[cursor..pos + 1]);
                    wp += run;
                }

                // Skip consecutive duplicates
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == ch {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                let remaining = data.len() - cursor;
                if wp + remaining > BUF_SIZE {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }
                if remaining > BUF_SIZE {
                    // Remaining data is larger than buffer — write directly
                    writer.write_all(&data[cursor..])?;
                } else {
                    outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
                    wp += remaining;
                }
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
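
// Hedged regression sketch (not in the original file) for the large-run fallback above:
// a gap between squeeze chars that is longer than BUF_SIZE must be written directly
// rather than copied into the bounded output buffer.
#[cfg(test)]
mod squeeze_single_mmap_example {
    use super::{squeeze_mmap, BUF_SIZE};

    #[test]
    fn long_run_between_squeeze_chars() {
        let mut data = vec![b'x'; BUF_SIZE + 17];
        data.extend_from_slice(b"aaa");
        data.extend_from_slice(&vec![b'y'; BUF_SIZE + 5]);
        let mut output: Vec<u8> = Vec::new();
        squeeze_mmap(b"a", &data, &mut output).unwrap();
        assert_eq!(output.len(), (BUF_SIZE + 17) + 1 + (BUF_SIZE + 5));
    }
}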