coreutils_rs/tr/core.rs

use std::io::{self, Read, Write};

use rayon::prelude::*;

const BUF_SIZE: usize = 1024 * 1024; // 1MB — fits L2/L3 cache for locality

/// Stream buffer: 256KB — process data immediately after each read().
/// Stays in L2 cache, matches typical kernel pipe buffer size.
/// Unlike fill_buf (which loops to accumulate 8MB = ~128 syscalls for pipes),
/// we read once and process immediately, matching GNU tr's approach.
const STREAM_BUF: usize = 256 * 1024;

/// Minimum size for parallel translation.
/// Lowered to 4MB since rayon overhead is small (~10μs) compared to
/// translation time per chunk (~400μs at 10GB/s).
const PARALLEL_TRANSLATE_THRESHOLD: usize = 4 * 1024 * 1024;

/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
/// If set2 is shorter than set1, the extra set1 bytes map to the last byte of
/// set2 (as GNU tr does); bytes outside set1 map to themselves.
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}
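
// A minimal illustration of the table semantics above. The test name and the
// byte values are ours, not part of the original file: 'a'..='c' mapped onto
// "xz" reuses 'z' for the overflow, and untouched bytes stay identity.
#[cfg(test)]
mod translate_table_examples {
    use super::build_translate_table;

    #[test]
    fn shorter_set2_repeats_last_byte() {
        let table = build_translate_table(b"abc", b"xz");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'z'); // set2 exhausted: last byte repeats
        assert_eq!(table[b'c' as usize], b'z');
        assert_eq!(table[b'q' as usize], b'q'); // bytes outside set1 are identity
    }
}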

/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}

// ============================================================================
// Table analysis for SIMD fast paths
// ============================================================================

/// Classification of a translation table for SIMD optimization.
#[allow(dead_code)] // Fields used only on x86_64 (AVX2 SIMD path)
enum TranslateKind {
    /// Table is identity — no translation needed.
    Identity,
    /// A contiguous range [lo, hi] with constant wrapping-add delta; identity elsewhere.
    RangeDelta { lo: u8, hi: u8, delta: u8 },
    /// Arbitrary translation — use general lookup table.
    General,
}

/// Analyze a translation table to detect SIMD-optimizable patterns.
fn analyze_table(table: &[u8; 256]) -> TranslateKind {
    let mut first_delta: Option<u8> = None;
    let mut lo: u8 = 0;
    let mut hi: u8 = 0;
    let mut count: u32 = 0;

    for i in 0..256u16 {
        let actual = table[i as usize];
        if actual != i as u8 {
            let delta = actual.wrapping_sub(i as u8);
            count += 1;
            match first_delta {
                None => {
                    first_delta = Some(delta);
                    lo = i as u8;
                    hi = i as u8;
                }
                Some(d) if d == delta => {
                    hi = i as u8;
                }
                _ => return TranslateKind::General,
            }
        }
    }

    match (count, first_delta) {
        (0, _) => TranslateKind::Identity,
        (c, Some(delta)) if c == (hi as u32 - lo as u32 + 1) => {
            TranslateKind::RangeDelta { lo, hi, delta }
        }
        _ => TranslateKind::General,
    }
}
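
// Worked example of the classification (ours, for illustration): translating
// a-z to A-Z yields a table that differs from identity only on the contiguous
// range 97..=122, each entry shifted by -32, i.e. a wrapping delta of 224, so
// it classifies as RangeDelta and can take the AVX2 fast path.
#[cfg(test)]
mod analyze_table_examples {
    use super::{analyze_table, build_translate_table, TranslateKind};

    #[test]
    fn lowercase_to_uppercase_is_range_delta() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&set1, &set2);
        assert!(matches!(
            analyze_table(&table),
            TranslateKind::RangeDelta { lo: b'a', hi: b'z', delta: 224 }
        ));
    }
}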

// ============================================================================
// SIMD translation (x86_64 AVX2)
// ============================================================================

#[cfg(target_arch = "x86_64")]
mod simd_tr {
    /// Translate bytes in range [lo, hi] by adding `delta` (wrapping), leave others unchanged.
    /// Processes 128 bytes per iteration (4x unroll) using AVX2.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta(data: &[u8], out: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0usize;

            // 4x unrolled: process 128 bytes per iteration for better ILP
            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(inp.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(inp.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(outp.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(outp.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *inp.add(i);
                *outp.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }

    /// General 256-byte table lookup using 16-way vpshufb (nibble decomposition).
    /// Splits each input byte into high nibble (selects one of 16 shuffle tables)
    /// and low nibble (index within the shuffle table). Processes 64 bytes per
    /// iteration (2x unrolled) for instruction-level parallelism.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn general_lookup(data: &[u8], out: &mut [u8], table: &[u8; 256]) {
        unsafe {
            use std::arch::x86_64::*;

            // Build 16 vpshufb LUTs from the 256-byte translation table.
            // LUT[h] covers table[h*16..h*16+16], broadcast to both 128-bit lanes.
            let tp = table.as_ptr();
            let mut luts = [_mm256_setzero_si256(); 16];
            let mut h = 0;
            while h < 16 {
                luts[h] =
                    _mm256_broadcastsi128_si256(_mm_loadu_si128(tp.add(h * 16) as *const __m128i));
                h += 1;
            }

            let lo_mask = _mm256_set1_epi8(0x0F);
            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0;

            // 2x unrolled: process 64 bytes per iteration
            while i + 64 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);

                let lo0 = _mm256_and_si256(v0, lo_mask);
                let lo1 = _mm256_and_si256(v1, lo_mask);
                let hi0 = _mm256_and_si256(_mm256_srli_epi16(v0, 4), lo_mask);
                let hi1 = _mm256_and_si256(_mm256_srli_epi16(v1, 4), lo_mask);

                let mut r0 = _mm256_setzero_si256();
                let mut r1 = _mm256_setzero_si256();

                // Process all 16 high-nibble values.
                // For each h, bytes where high_nibble == h get their result from luts[h].
                macro_rules! do_nib {
                    ($h:literal) => {
                        let hv = _mm256_set1_epi8($h);
                        let lut = luts[$h as usize];
                        let m0 = _mm256_cmpeq_epi8(hi0, hv);
                        let m1 = _mm256_cmpeq_epi8(hi1, hv);
                        r0 = _mm256_or_si256(
                            r0,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo0), m0),
                        );
                        r1 = _mm256_or_si256(
                            r1,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo1), m1),
                        );
                    };
                }

                do_nib!(0);
                do_nib!(1);
                do_nib!(2);
                do_nib!(3);
                do_nib!(4);
                do_nib!(5);
                do_nib!(6);
                do_nib!(7);
                do_nib!(8);
                do_nib!(9);
                do_nib!(10);
                do_nib!(11);
                do_nib!(12);
                do_nib!(13);
                do_nib!(14);
                do_nib!(15);

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                i += 64;
            }

            // Single vector tail
            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let lo = _mm256_and_si256(v, lo_mask);
                let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

                let mut result = _mm256_setzero_si256();
                let mut hh = 0u8;
                while hh < 16 {
                    let hv = _mm256_set1_epi8(hh as i8);
                    let m = _mm256_cmpeq_epi8(hi, hv);
                    result = _mm256_or_si256(
                        result,
                        _mm256_and_si256(_mm256_shuffle_epi8(luts[hh as usize], lo), m),
                    );
                    hh += 1;
                }

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            // Scalar tail
            while i < len {
                *outp.add(i) = *table.get_unchecked(*inp.add(i) as usize);
                i += 1;
            }
        }
    }

    /// In-place general 256-byte table lookup using 16-way vpshufb.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn general_lookup_inplace(data: &mut [u8], table: &[u8; 256]) {
        unsafe {
            use std::arch::x86_64::*;

            let tp = table.as_ptr();
            let mut luts = [_mm256_setzero_si256(); 16];
            let mut h = 0;
            while h < 16 {
                luts[h] =
                    _mm256_broadcastsi128_si256(_mm_loadu_si128(tp.add(h * 16) as *const __m128i));
                h += 1;
            }

            let lo_mask = _mm256_set1_epi8(0x0F);
            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0;

            while i + 64 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);

                let lo0 = _mm256_and_si256(v0, lo_mask);
                let lo1 = _mm256_and_si256(v1, lo_mask);
                let hi0 = _mm256_and_si256(_mm256_srli_epi16(v0, 4), lo_mask);
                let hi1 = _mm256_and_si256(_mm256_srli_epi16(v1, 4), lo_mask);

                let mut r0 = _mm256_setzero_si256();
                let mut r1 = _mm256_setzero_si256();

                macro_rules! do_nib {
                    ($h:literal) => {
                        let hv = _mm256_set1_epi8($h);
                        let lut = luts[$h as usize];
                        let m0 = _mm256_cmpeq_epi8(hi0, hv);
                        let m1 = _mm256_cmpeq_epi8(hi1, hv);
                        r0 = _mm256_or_si256(
                            r0,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo0), m0),
                        );
                        r1 = _mm256_or_si256(
                            r1,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo1), m1),
                        );
                    };
                }

                do_nib!(0);
                do_nib!(1);
                do_nib!(2);
                do_nib!(3);
                do_nib!(4);
                do_nib!(5);
                do_nib!(6);
                do_nib!(7);
                do_nib!(8);
                do_nib!(9);
                do_nib!(10);
                do_nib!(11);
                do_nib!(12);
                do_nib!(13);
                do_nib!(14);
                do_nib!(15);

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                i += 64;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let lo = _mm256_and_si256(v, lo_mask);
                let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

                let mut result = _mm256_setzero_si256();
                let mut hh = 0u8;
                while hh < 16 {
                    let hv = _mm256_set1_epi8(hh as i8);
                    let m = _mm256_cmpeq_epi8(hi, hv);
                    result = _mm256_or_si256(
                        result,
                        _mm256_and_si256(_mm256_shuffle_epi8(luts[hh as usize], lo), m),
                    );
                    hh += 1;
                }

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = *table.get_unchecked(b as usize);
                i += 1;
            }
        }
    }

    /// In-place SIMD translate for stdin path.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta_inplace(data: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0usize;

            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(ptr.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(ptr.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(ptr.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(ptr.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }
}
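
// A scalar model of the vpshufb nibble decomposition used in general_lookup
// above. This is our illustration, not a replacement for the AVX2 path: each
// byte b selects a 16-byte row of the table with its high nibble (b >> 4) and
// an index within that row with its low nibble (b & 0x0F). The test checks the
// model agrees with a direct table lookup for every byte value.
#[cfg(test)]
mod nibble_lookup_model {
    #[test]
    fn nibble_decomposition_matches_direct_lookup() {
        // Arbitrary non-trivial table: rotate every byte value by 3.
        let table: [u8; 256] = std::array::from_fn(|i| (i as u8).wrapping_add(3));
        for b in 0u16..256 {
            let b = b as u8;
            let row = &table[(b as usize >> 4) * 16..][..16];
            let via_nibbles = row[(b & 0x0F) as usize];
            assert_eq!(via_nibbles, table[b as usize]);
        }
    }
}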

/// Check if AVX2 is available at runtime.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn has_avx2() -> bool {
    is_x86_feature_detected!("avx2")
}

/// Translate a chunk of bytes using a lookup table — unrolled 8-byte inner loop.
#[inline(always)]
fn translate_chunk(chunk: &[u8], out: &mut [u8], table: &[u8; 256]) {
    let len = chunk.len();
    let mut i = 0;
    while i + 8 <= len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
            *out.get_unchecked_mut(i + 1) =
                *table.get_unchecked(*chunk.get_unchecked(i + 1) as usize);
            *out.get_unchecked_mut(i + 2) =
                *table.get_unchecked(*chunk.get_unchecked(i + 2) as usize);
            *out.get_unchecked_mut(i + 3) =
                *table.get_unchecked(*chunk.get_unchecked(i + 3) as usize);
            *out.get_unchecked_mut(i + 4) =
                *table.get_unchecked(*chunk.get_unchecked(i + 4) as usize);
            *out.get_unchecked_mut(i + 5) =
                *table.get_unchecked(*chunk.get_unchecked(i + 5) as usize);
            *out.get_unchecked_mut(i + 6) =
                *table.get_unchecked(*chunk.get_unchecked(i + 6) as usize);
            *out.get_unchecked_mut(i + 7) =
                *table.get_unchecked(*chunk.get_unchecked(i + 7) as usize);
        }
        i += 8;
    }
    while i < len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
        }
        i += 1;
    }
}

/// In-place translate for stdin path — avoids separate output buffer.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let tab = table.as_ptr();

    unsafe {
        let mut i = 0;
        while i + 8 <= len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *tab.add(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *tab.add(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *tab.add(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *tab.add(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *tab.add(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *tab.add(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *tab.add(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// Dispatch: choose SIMD or scalar based on table analysis
// ============================================================================

/// Translate a chunk using the best available method.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_chunk_dispatch(
    chunk: &[u8],
    out: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {
            out[..chunk.len()].copy_from_slice(chunk);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta(chunk, out, *lo, *hi, *delta) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_chunk(chunk, out, table);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::General => {
            if _use_simd {
                unsafe { simd_tr::general_lookup(chunk, out, table) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::General => {
            translate_chunk(chunk, out, table);
        }
    }
}

/// In-place translate dispatch.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_inplace_dispatch(
    data: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {}
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta_inplace(data, *lo, *hi, *delta) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_inplace(data, table);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::General => {
            if _use_simd {
                unsafe { simd_tr::general_lookup_inplace(data, table) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::General => {
            translate_inplace(data, table);
        }
    }
}

// ============================================================================
// Streaming functions (Read + Write)
// Process data immediately after each read() — no fill_buf accumulation.
// Uses 256KB buffer (L2-friendly) instead of 8MB.
// ============================================================================

pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        translate_inplace_dispatch(&mut buf[..n], &table, &kind, use_simd);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}
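
// Usage sketch for the streaming path (ours, illustrative only): any Read/Write
// pair works, here an in-memory Cursor and a Vec stand in for stdin/stdout.
#[cfg(test)]
mod translate_stream_examples {
    use super::translate;
    use std::io::Cursor;

    #[test]
    fn translate_over_in_memory_reader() {
        let mut input = Cursor::new(b"hello world".to_vec());
        let mut output = Vec::new();
        translate(b"lo", b"LO", &mut input, &mut output).unwrap();
        assert_eq!(output, b"heLLO wOrLd".to_vec());
    }
}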

pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let mut outbuf = vec![0u8; STREAM_BUF];
    let mut inbuf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut inbuf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            let translated = unsafe { *table.get_unchecked(b as usize) };
            if is_member(&squeeze_set, translated) {
                if last_squeezed == translated as u16 {
                    continue;
                }
                last_squeezed = translated as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = translated;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single character delete using SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }

    // Fast paths: 2- or 3-char delete using SIMD memchr2/memchr3.
    // (An empty delete set falls through to the general bitset path below.)
    if delete_chars.len() == 2 || delete_chars.len() == 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut outbuf = vec![0u8; STREAM_BUF];
    let mut inbuf = vec![0u8; STREAM_BUF];

    loop {
        let n = match reader.read(&mut inbuf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            if !is_member(&member, b) {
                unsafe {
                    *outbuf.get_unchecked_mut(out_pos) = b;
                }
                out_pos += 1;
            }
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Single-character delete from a reader using SIMD memchr scanning.
fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let chunk = &buf[..n];
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                writer.write_all(&chunk[last..pos])?;
            }
            last = pos + 1;
        }
        if last < n {
            writer.write_all(&chunk[last..n])?;
        }
    }
    Ok(())
}

/// Multi-character delete (2-3 chars) from a reader using SIMD memchr2/memchr3.
fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let chunk = &buf[..n];
        let mut last = 0;
        if chars.len() == 2 {
            for pos in memchr::memchr2_iter(chars[0], chars[1], chunk) {
                if pos > last {
                    writer.write_all(&chunk[last..pos])?;
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr3_iter(chars[0], chars[1], chars[2], chunk) {
                if pos > last {
                    writer.write_all(&chunk[last..pos])?;
                }
                last = pos + 1;
            }
        }
        if last < n {
            writer.write_all(&chunk[last..n])?;
        }
    }
    Ok(())
}
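
// Streaming delete sketch (ours): a two-character delete set routes through
// the memchr2 fast path above; the data and expected output are illustrative.
#[cfg(test)]
mod delete_stream_examples {
    use super::delete;
    use std::io::Cursor;

    #[test]
    fn delete_two_chars_streaming() {
        let mut input = Cursor::new(b"a-b_c-d_e".to_vec());
        let mut output = Vec::new();
        delete(b"-_", &mut input, &mut output).unwrap();
        assert_eq!(output, b"abcde".to_vec());
    }
}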

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; STREAM_BUF];
    let mut inbuf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut inbuf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single squeeze char — bulk copy non-match runs
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; STREAM_BUF];
    let mut inbuf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut inbuf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            if is_member(&member, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Squeeze a single character from a stream — optimized with bulk copy.
fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut inbuf = vec![0u8; STREAM_BUF];
    let mut outbuf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = match reader.read(&mut inbuf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let chunk = &inbuf[..n];
        let mut out_pos = 0;
        let mut i = 0;

        while i < n {
            if chunk[i] == ch {
                if !was_squeeze_char {
                    // First in run — keep it
                    unsafe {
                        *outbuf.get_unchecked_mut(out_pos) = ch;
                    }
                    out_pos += 1;
                    was_squeeze_char = true;
                }
                // Skip rest of run
                i += 1;
            } else {
                was_squeeze_char = false;
                // Find next squeeze char using SIMD memchr
                let remaining = &chunk[i..];
                let run_end = memchr::memchr(ch, remaining).unwrap_or(remaining.len());
                // Bulk copy the non-match run
                outbuf[out_pos..out_pos + run_end].copy_from_slice(&chunk[i..i + run_end]);
                out_pos += run_end;
                i += run_end;
            }
        }

        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
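
// Squeeze usage sketch (ours): repeated members of the squeeze set collapse to
// a single byte, while runs of other bytes pass through untouched.
#[cfg(test)]
mod squeeze_stream_examples {
    use super::squeeze;
    use std::io::Cursor;

    #[test]
    fn squeeze_collapses_runs() {
        let mut input = Cursor::new(b"aaabbb  ccc".to_vec());
        let mut output = Vec::new();
        // 'a', 'b', and space are squeezed; 'c' is not in the set.
        squeeze(b"ab ", &mut input, &mut output).unwrap();
        assert_eq!(output, b"ab ccc".to_vec());
    }
}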

// ============================================================================
// Mmap-based functions (zero-copy input from byte slice)
// ============================================================================

/// Translate bytes from an mmap'd byte slice — zero syscall reads.
/// Uses SIMD AVX2 for range-delta patterns (e.g., a-z → A-Z).
/// For large inputs, translates in parallel using rayon for maximum throughput.
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    if matches!(kind, TranslateKind::Identity) {
        return writer.write_all(data);
    }

    // Parallel translation for very large inputs — single shared output buffer
    if data.len() >= PARALLEL_TRANSLATE_THRESHOLD {
        let mut out = vec![0u8; data.len()];
        let num_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() + num_threads - 1) / num_threads;
        // Align to BUF_SIZE boundaries for cache efficiency
        let chunk_size = ((chunk_size + BUF_SIZE - 1) / BUF_SIZE) * BUF_SIZE;

        // Translate in parallel into shared buffer — no per-chunk allocation
        data.par_chunks(chunk_size)
            .zip(out.par_chunks_mut(chunk_size))
            .for_each(|(inp, outp)| {
                translate_chunk_dispatch(inp, &mut outp[..inp.len()], &table, &kind, use_simd);
            });

        // Single write of entire result
        writer.write_all(&out)?;
        return Ok(());
    }

    // Single-allocation path: translate entire data into one buffer, single write
    if data.len() <= BUF_SIZE {
        let mut out = vec![0u8; data.len()];
        translate_chunk_dispatch(data, &mut out, &table, &kind, use_simd);
        writer.write_all(&out)?;
        return Ok(());
    }

    // Chunked path for larger data — reuses buffer across chunks
    let mut out = vec![0u8; BUF_SIZE];
    for chunk in data.chunks(BUF_SIZE) {
        translate_chunk_dispatch(chunk, &mut out[..chunk.len()], &table, &kind, use_simd);
        writer.write_all(&out[..chunk.len()])?;
    }
    Ok(())
}

/// Translate + squeeze from mmap'd byte slice.
/// Uses a two-pass approach for the common case where few bytes are squeezed:
/// translate first (using SIMD when possible), then squeeze.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Pre-allocate output buffer for translation
    let mut trans_buf = vec![0u8; BUF_SIZE];
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        // Phase 1: Translate (may use SIMD)
        translate_chunk_dispatch(
            chunk,
            &mut trans_buf[..chunk.len()],
            &table,
            &kind,
            use_simd,
        );

        // Phase 2: Squeeze
        let mut out_pos = 0;
        for i in 0..chunk.len() {
            let translated = unsafe { *trans_buf.get_unchecked(i) };
            if is_member(&squeeze_set, translated) {
                if last_squeezed == translated as u16 {
                    continue;
                }
                last_squeezed = translated as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = translated;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Delete from mmap'd byte slice.
/// Uses SIMD memchr for single-character delete (common case).
/// For multi-char delete, uses 8-byte unrolled scan with bitset lookup.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single character delete uses SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }

    // Fast path: 2-char delete uses SIMD memchr2 (bulk copy between matches)
    if delete_chars.len() == 2 {
        return delete_multi_memchr_mmap::<2>(delete_chars, data, writer);
    }

    // Fast path: 3-char delete uses SIMD memchr3 (bulk copy between matches)
    if delete_chars.len() == 3 {
        return delete_multi_memchr_mmap::<3>(delete_chars, data, writer);
    }

    let member = build_member_set(delete_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        let len = chunk.len();
        let mut i = 0;

        // 8-byte unrolled scan for better ILP
        while i + 8 <= len {
            unsafe {
                let b0 = *chunk.get_unchecked(i);
                let b1 = *chunk.get_unchecked(i + 1);
                let b2 = *chunk.get_unchecked(i + 2);
                let b3 = *chunk.get_unchecked(i + 3);
                let b4 = *chunk.get_unchecked(i + 4);
                let b5 = *chunk.get_unchecked(i + 5);
                let b6 = *chunk.get_unchecked(i + 6);
                let b7 = *chunk.get_unchecked(i + 7);

                if !is_member(&member, b0) {
                    *outbuf.get_unchecked_mut(out_pos) = b0;
                    out_pos += 1;
                }
                if !is_member(&member, b1) {
                    *outbuf.get_unchecked_mut(out_pos) = b1;
                    out_pos += 1;
                }
                if !is_member(&member, b2) {
                    *outbuf.get_unchecked_mut(out_pos) = b2;
                    out_pos += 1;
                }
                if !is_member(&member, b3) {
                    *outbuf.get_unchecked_mut(out_pos) = b3;
                    out_pos += 1;
                }
                if !is_member(&member, b4) {
                    *outbuf.get_unchecked_mut(out_pos) = b4;
                    out_pos += 1;
                }
                if !is_member(&member, b5) {
                    *outbuf.get_unchecked_mut(out_pos) = b5;
                    out_pos += 1;
                }
                if !is_member(&member, b6) {
                    *outbuf.get_unchecked_mut(out_pos) = b6;
                    out_pos += 1;
                }
                if !is_member(&member, b7) {
                    *outbuf.get_unchecked_mut(out_pos) = b7;
                    out_pos += 1;
                }
            }
            i += 8;
        }

        while i < len {
            unsafe {
                let b = *chunk.get_unchecked(i);
                if !is_member(&member, b) {
                    *outbuf.get_unchecked_mut(out_pos) = b;
                    out_pos += 1;
                }
            }
            i += 1;
        }

        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Multi-character delete (2-3 chars) using SIMD memchr2/memchr3.
/// Bulk-copies runs of non-matching bytes, far faster than byte-at-a-time.
fn delete_multi_memchr_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut last = 0;
    if N == 2 {
        for pos in memchr::memchr2_iter(chars[0], chars[1], data) {
            if pos > last {
                writer.write_all(&data[last..pos])?;
            }
            last = pos + 1;
        }
    } else {
        for pos in memchr::memchr3_iter(chars[0], chars[1], chars[2], data) {
            if pos > last {
                writer.write_all(&data[last..pos])?;
            }
            last = pos + 1;
        }
    }
    if last < data.len() {
        writer.write_all(&data[last..])?;
    }
    Ok(())
}

/// Single-character delete from mmap using SIMD memchr.
/// Copies runs of non-matching bytes in bulk (memcpy), far faster than byte-at-a-time.
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut last = 0;
    for pos in memchr::memchr_iter(ch, data) {
        if pos > last {
            writer.write_all(&data[last..pos])?;
        }
        last = pos + 1;
    }
    if last < data.len() {
        writer.write_all(&data[last..])?;
    }
    Ok(())
}

/// Delete + squeeze from mmap'd byte slice.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Squeeze from mmap'd byte slice.
/// Uses a two-pass approach: find runs of squeezable bytes with memchr,
/// then copy non-squeezed content in bulk.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single squeeze character — use SIMD memchr to find runs
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }

    // Fast path: 2-3 squeeze chars — use memchr2/memchr3 for SIMD scanning
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    // General path: byte-at-a-time bitset check with bulk copy of non-member spans
    let member = build_member_set(squeeze_chars);
    let mut last_squeezed: u16 = 256;
    let mut span_start = 0; // start of current non-squeezed span in data
    let len = data.len();
    let mut i = 0;

    while i < len {
        let b = unsafe { *data.get_unchecked(i) };
        if is_member(&member, b) {
            // Write non-member span before this member byte
            if i > span_start {
                writer.write_all(&data[span_start..i])?;
            }
            if last_squeezed != b as u16 {
                // First occurrence of this member — write it
                writer.write_all(&[b])?;
                last_squeezed = b as u16;
            }
            i += 1;
            // Skip consecutive duplicates of same byte
            while i < len && data[i] == b {
                i += 1;
            }
            span_start = i;
        } else {
            last_squeezed = 256;
            i += 1;
        }
    }

    // Write remaining non-member span
    if span_start < len {
        writer.write_all(&data[span_start..])?;
    }
    Ok(())
}

/// Squeeze with 2-3 char sets using SIMD memchr2/memchr3 for fast scanning.
/// Writes non-member spans directly from mmap (zero-copy).
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                // Write non-member span
                if pos > cursor {
                    writer.write_all(&data[cursor..pos])?;
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    writer.write_all(&[b])?;
                    last_squeezed = b as u16;
                }
                // Skip consecutive duplicates of same byte
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                writer.write_all(&data[cursor..])?;
                break;
            }
        }
    }
    Ok(())
}

/// Squeeze a single repeated character from mmap'd data.
/// Zero-copy: writes directly from mmap data, no intermediate buffer.
/// Uses SIMD memchr for bulk-skip between occurrences of the squeeze char.
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut cursor = 0;

    while cursor < data.len() {
        // Find next occurrence of squeeze char using SIMD memchr
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                // Write everything from cursor to pos + 1 (including first occurrence)
                writer.write_all(&data[cursor..pos + 1])?;
                // Skip consecutive duplicates
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == ch {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                // No more occurrences — write remaining data
                writer.write_all(&data[cursor..])?;
                break;
            }
        }
    }
    Ok(())
}
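
// End-to-end sketches for the mmap-style entry points (ours, illustrative):
// they take plain byte slices, so in-memory slices stand in for real mmaps.
#[cfg(test)]
mod mmap_path_examples {
    use super::{delete_mmap, squeeze_mmap, translate_mmap};

    #[test]
    fn translate_mmap_range_delta() {
        let mut out = Vec::new();
        translate_mmap(b"abc", b"xyz", b"aabbcc--", &mut out).unwrap();
        assert_eq!(out, b"xxyyzz--".to_vec());
    }

    #[test]
    fn delete_mmap_multi_char() {
        let mut out = Vec::new();
        // Five delete chars, so this exercises the general bitset path.
        delete_mmap(b"aeiou", b"translation", &mut out).unwrap();
        assert_eq!(out, b"trnsltn".to_vec());
    }

    #[test]
    fn squeeze_mmap_single_char() {
        let mut out = Vec::new();
        squeeze_mmap(b"/", b"a//b////c", &mut out).unwrap();
        assert_eq!(out, b"a/b/c".to_vec());
    }
}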