// coreutils_rs/tr/core.rs

use std::io::{self, Read, Write};

const BUF_SIZE: usize = 1024 * 1024; // 1MB — fits L2/L3 cache for locality

/// Stream buffer: 256KB — process data immediately after each read().
/// Stays in L2 cache and is large enough to drain a full kernel pipe buffer per read().
/// Unlike fill_buf (which loops to accumulate 8MB = ~128 syscalls for pipes),
/// we read once and process immediately, matching GNU tr's approach.
const STREAM_BUF: usize = 256 * 1024;

/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}

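// A minimal #[cfg(test)] sketch of the padding rule above: when set2 is shorter
// than set1, the unpaired set1 bytes map to set2's last byte. (Module name is
// illustrative only.)
#[cfg(test)]
mod translate_table_examples {
    use super::build_translate_table;

    #[test]
    fn pads_with_last_byte_of_set2() {
        // "abc" -> "xy": 'c' has no partner, so it maps to 'y' (set2's last byte).
        let table = build_translate_table(b"abc", b"xy");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'y');
        assert_eq!(table[b'c' as usize], b'y');
        // Bytes outside set1 keep their identity mapping.
        assert_eq!(table[b'z' as usize], b'z');
    }
}
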
/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}

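// A small illustrative check of the bitset layout above: one bit per byte
// value, queried by is_member() in O(1). (Module name is illustrative only.)
#[cfg(test)]
mod member_set_examples {
    use super::{build_member_set, is_member};

    #[test]
    fn bitset_membership() {
        let set = build_member_set(b"aeiou");
        assert!(is_member(&set, b'a'));
        assert!(is_member(&set, b'u'));
        assert!(!is_member(&set, b'b'));
        assert!(!is_member(&set, 0));
    }
}
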
// ============================================================================
// Table analysis for SIMD fast paths
// ============================================================================

/// Classification of a translation table for SIMD optimization.
#[allow(dead_code)] // Fields used only on x86_64 (AVX2 SIMD path)
enum TranslateKind {
    /// Table is identity — no translation needed.
    Identity,
    /// A contiguous range [lo, hi] with constant wrapping-add delta; identity elsewhere.
    RangeDelta { lo: u8, hi: u8, delta: u8 },
    /// Arbitrary translation — use general lookup table.
    General,
}

/// Analyze a translation table to detect SIMD-optimizable patterns.
fn analyze_table(table: &[u8; 256]) -> TranslateKind {
    let mut first_delta: Option<u8> = None;
    let mut lo: u8 = 0;
    let mut hi: u8 = 0;
    let mut count: u32 = 0;

    for i in 0..256u16 {
        let actual = table[i as usize];
        if actual != i as u8 {
            let delta = actual.wrapping_sub(i as u8);
            count += 1;
            match first_delta {
                None => {
                    first_delta = Some(delta);
                    lo = i as u8;
                    hi = i as u8;
                }
                Some(d) if d == delta => {
                    hi = i as u8;
                }
                _ => return TranslateKind::General,
            }
        }
    }

    match (count, first_delta) {
        (0, _) => TranslateKind::Identity,
        (c, Some(delta)) if c == (hi as u32 - lo as u32 + 1) => {
            TranslateKind::RangeDelta { lo, hi, delta }
        }
        _ => TranslateKind::General,
    }
}

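// An illustrative classification check, assuming the common `tr a-z A-Z` table:
// a contiguous range with one wrapping delta is reported as RangeDelta, and an
// untouched table as Identity. (Module name is illustrative only.)
#[cfg(test)]
mod analyze_table_examples {
    use super::{analyze_table, build_translate_table, TranslateKind};

    #[test]
    fn lowercase_to_uppercase_is_range_delta() {
        let table = build_translate_table(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        );
        match analyze_table(&table) {
            TranslateKind::RangeDelta { lo, hi, delta } => {
                assert_eq!(lo, b'a');
                assert_eq!(hi, b'z');
                assert_eq!(delta, b'A'.wrapping_sub(b'a'));
            }
            _ => panic!("expected RangeDelta"),
        }
    }

    #[test]
    fn empty_sets_give_identity() {
        let table = build_translate_table(b"", b"");
        assert!(matches!(analyze_table(&table), TranslateKind::Identity));
    }
}
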
// ============================================================================
// SIMD translation (x86_64 AVX2)
// ============================================================================

#[cfg(target_arch = "x86_64")]
mod simd_tr {
    /// Translate bytes in range [lo, hi] by adding `delta` (wrapping), leave others unchanged.
    /// Processes 128 bytes per iteration (4x unroll) using AVX2.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta(data: &[u8], out: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0usize;

            // 4x unrolled: process 128 bytes per iteration for better ILP
            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(inp.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(inp.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(outp.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(outp.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *inp.add(i);
                *outp.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }

    /// General 256-byte table lookup using 16-way vpshufb (nibble decomposition).
    /// Splits each input byte into high nibble (selects one of 16 shuffle tables)
    /// and low nibble (index within the shuffle table). Processes 64 bytes per
    /// iteration (2x unrolled) for instruction-level parallelism.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn general_lookup(data: &[u8], out: &mut [u8], table: &[u8; 256]) {
        unsafe {
            use std::arch::x86_64::*;

            // Build 16 vpshufb LUTs from the 256-byte translation table.
            // LUT[h] covers table[h*16..h*16+16], broadcast to both 128-bit lanes.
            let tp = table.as_ptr();
            let mut luts = [_mm256_setzero_si256(); 16];
            let mut h = 0;
            while h < 16 {
                luts[h] =
                    _mm256_broadcastsi128_si256(_mm_loadu_si128(tp.add(h * 16) as *const __m128i));
                h += 1;
            }

            let lo_mask = _mm256_set1_epi8(0x0F);
            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0;

            // 2x unrolled: process 64 bytes per iteration
            while i + 64 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);

                let lo0 = _mm256_and_si256(v0, lo_mask);
                let lo1 = _mm256_and_si256(v1, lo_mask);
                let hi0 = _mm256_and_si256(_mm256_srli_epi16(v0, 4), lo_mask);
                let hi1 = _mm256_and_si256(_mm256_srli_epi16(v1, 4), lo_mask);

                let mut r0 = _mm256_setzero_si256();
                let mut r1 = _mm256_setzero_si256();

                // Process all 16 high-nibble values.
                // For each h, bytes where high_nibble == h get their result from luts[h].
                macro_rules! do_nib {
                    ($h:literal) => {
                        let hv = _mm256_set1_epi8($h);
                        let lut = luts[$h as usize];
                        let m0 = _mm256_cmpeq_epi8(hi0, hv);
                        let m1 = _mm256_cmpeq_epi8(hi1, hv);
                        r0 = _mm256_or_si256(
                            r0,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo0), m0),
                        );
                        r1 = _mm256_or_si256(
                            r1,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo1), m1),
                        );
                    };
                }

                do_nib!(0);
                do_nib!(1);
                do_nib!(2);
                do_nib!(3);
                do_nib!(4);
                do_nib!(5);
                do_nib!(6);
                do_nib!(7);
                do_nib!(8);
                do_nib!(9);
                do_nib!(10);
                do_nib!(11);
                do_nib!(12);
                do_nib!(13);
                do_nib!(14);
                do_nib!(15);

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                i += 64;
            }

            // Single vector tail
            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let lo = _mm256_and_si256(v, lo_mask);
                let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

                let mut result = _mm256_setzero_si256();
                let mut hh = 0u8;
                while hh < 16 {
                    let hv = _mm256_set1_epi8(hh as i8);
                    let m = _mm256_cmpeq_epi8(hi, hv);
                    result = _mm256_or_si256(
                        result,
                        _mm256_and_si256(_mm256_shuffle_epi8(luts[hh as usize], lo), m),
                    );
                    hh += 1;
                }

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            // Scalar tail
            while i < len {
                *outp.add(i) = *table.get_unchecked(*inp.add(i) as usize);
                i += 1;
            }
        }
    }

    /// In-place general 256-byte table lookup using 16-way vpshufb.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn general_lookup_inplace(data: &mut [u8], table: &[u8; 256]) {
        unsafe {
            use std::arch::x86_64::*;

            let tp = table.as_ptr();
            let mut luts = [_mm256_setzero_si256(); 16];
            let mut h = 0;
            while h < 16 {
                luts[h] =
                    _mm256_broadcastsi128_si256(_mm_loadu_si128(tp.add(h * 16) as *const __m128i));
                h += 1;
            }

            let lo_mask = _mm256_set1_epi8(0x0F);
            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0;

            while i + 64 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);

                let lo0 = _mm256_and_si256(v0, lo_mask);
                let lo1 = _mm256_and_si256(v1, lo_mask);
                let hi0 = _mm256_and_si256(_mm256_srli_epi16(v0, 4), lo_mask);
                let hi1 = _mm256_and_si256(_mm256_srli_epi16(v1, 4), lo_mask);

                let mut r0 = _mm256_setzero_si256();
                let mut r1 = _mm256_setzero_si256();

                macro_rules! do_nib {
                    ($h:literal) => {
                        let hv = _mm256_set1_epi8($h);
                        let lut = luts[$h as usize];
                        let m0 = _mm256_cmpeq_epi8(hi0, hv);
                        let m1 = _mm256_cmpeq_epi8(hi1, hv);
                        r0 = _mm256_or_si256(
                            r0,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo0), m0),
                        );
                        r1 = _mm256_or_si256(
                            r1,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo1), m1),
                        );
                    };
                }

                do_nib!(0);
                do_nib!(1);
                do_nib!(2);
                do_nib!(3);
                do_nib!(4);
                do_nib!(5);
                do_nib!(6);
                do_nib!(7);
                do_nib!(8);
                do_nib!(9);
                do_nib!(10);
                do_nib!(11);
                do_nib!(12);
                do_nib!(13);
                do_nib!(14);
                do_nib!(15);

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                i += 64;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let lo = _mm256_and_si256(v, lo_mask);
                let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

                let mut result = _mm256_setzero_si256();
                let mut hh = 0u8;
                while hh < 16 {
                    let hv = _mm256_set1_epi8(hh as i8);
                    let m = _mm256_cmpeq_epi8(hi, hv);
                    result = _mm256_or_si256(
                        result,
                        _mm256_and_si256(_mm256_shuffle_epi8(luts[hh as usize], lo), m),
                    );
                    hh += 1;
                }

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = *table.get_unchecked(b as usize);
                i += 1;
            }
        }
    }

    /// In-place SIMD translate for stdin path.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta_inplace(data: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0usize;

            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(ptr.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(ptr.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(ptr.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(ptr.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }
}

/// Check if AVX2 is available at runtime.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn has_avx2() -> bool {
    is_x86_feature_detected!("avx2")
}

/// Translate a chunk of bytes using a lookup table — unrolled 8-byte inner loop.
#[inline(always)]
fn translate_chunk(chunk: &[u8], out: &mut [u8], table: &[u8; 256]) {
    let len = chunk.len();
    let mut i = 0;
    while i + 8 <= len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
            *out.get_unchecked_mut(i + 1) =
                *table.get_unchecked(*chunk.get_unchecked(i + 1) as usize);
            *out.get_unchecked_mut(i + 2) =
                *table.get_unchecked(*chunk.get_unchecked(i + 2) as usize);
            *out.get_unchecked_mut(i + 3) =
                *table.get_unchecked(*chunk.get_unchecked(i + 3) as usize);
            *out.get_unchecked_mut(i + 4) =
                *table.get_unchecked(*chunk.get_unchecked(i + 4) as usize);
            *out.get_unchecked_mut(i + 5) =
                *table.get_unchecked(*chunk.get_unchecked(i + 5) as usize);
            *out.get_unchecked_mut(i + 6) =
                *table.get_unchecked(*chunk.get_unchecked(i + 6) as usize);
            *out.get_unchecked_mut(i + 7) =
                *table.get_unchecked(*chunk.get_unchecked(i + 7) as usize);
        }
        i += 8;
    }
    while i < len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
        }
        i += 1;
    }
}

/// In-place translate for stdin path — avoids separate output buffer.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let tab = table.as_ptr();

    unsafe {
        let mut i = 0;
        while i + 8 <= len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *tab.add(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *tab.add(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *tab.add(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *tab.add(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *tab.add(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *tab.add(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *tab.add(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// Dispatch: choose SIMD or scalar based on table analysis
// ============================================================================

/// Translate a chunk using the best available method.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_chunk_dispatch(
    chunk: &[u8],
    out: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {
            out[..chunk.len()].copy_from_slice(chunk);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta(chunk, out, *lo, *hi, *delta) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_chunk(chunk, out, table);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::General => {
            if _use_simd {
                unsafe { simd_tr::general_lookup(chunk, out, table) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::General => {
            translate_chunk(chunk, out, table);
        }
    }
}

/// In-place translate dispatch.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_inplace_dispatch(
    data: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {}
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta_inplace(data, *lo, *hi, *delta) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_inplace(data, table);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::General => {
            if _use_simd {
                unsafe { simd_tr::general_lookup_inplace(data, table) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::General => {
            translate_inplace(data, table);
        }
    }
}

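// An illustrative parity sketch: on x86_64 with AVX2 available at runtime, the
// SIMD general_lookup path is expected to agree byte-for-byte with the scalar
// translate_chunk fallback. (Module name and the sample table are illustrative.)
#[cfg(all(test, target_arch = "x86_64"))]
mod simd_parity_examples {
    use super::*;

    #[test]
    fn general_lookup_matches_scalar() {
        if !has_avx2() {
            return; // nothing to compare on machines without AVX2
        }
        // Arbitrary non-range table: swap 'a' <-> 'b' and shift a few digits.
        let table = build_translate_table(b"ab0123", b"ba5678");
        let data: Vec<u8> = (0..=255u8).cycle().take(1000).collect();
        let mut simd_out = vec![0u8; data.len()];
        let mut scalar_out = vec![0u8; data.len()];
        unsafe { simd_tr::general_lookup(&data, &mut simd_out, &table) };
        translate_chunk(&data, &mut scalar_out, &table);
        assert_eq!(simd_out, scalar_out);
    }
}
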
// ============================================================================
// Streaming functions (Read + Write)
// Process data immediately after each read() — no fill_buf accumulation.
// Most paths use a 256KB buffer (L2-friendly); translate uses a 1MB buffer to
// cut read() syscalls on pipes.
// ============================================================================

pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Use 1MB buffer for fewer read() syscalls on pipes
    let mut buf = vec![0u8; BUF_SIZE];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        translate_inplace_dispatch(&mut buf[..n], &table, &kind, use_simd);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

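// Illustrative streaming usage: the API only needs a Read + Write pair, so an
// in-memory Cursor/Vec round-trip is enough to exercise it. (Module name is
// illustrative only.)
#[cfg(test)]
mod translate_stream_examples {
    use super::translate;
    use std::io::Cursor;

    #[test]
    fn uppercases_ascii() {
        let mut input = Cursor::new(b"hello, tr!".to_vec());
        let mut output = Vec::new();
        translate(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
            &mut input,
            &mut output,
        )
        .unwrap();
        assert_eq!(output, b"HELLO, TR!".to_vec());
    }
}
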
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Single buffer: SIMD translate in-place, then squeeze in-place compaction.
    // Eliminates outbuf allocation and saves one full memcpy of data.
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // Phase 1: SIMD translate in-place (uses AVX2 when available)
        translate_inplace_dispatch(&mut buf[..n], &table, &kind, use_simd);
        // Phase 2: squeeze in-place compaction (wp <= i always, safe)
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

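// Illustrative usage of the two-phase path above: translate first, then squeeze
// repeats of bytes in set2, mirroring `tr -s SET1 SET2`. (Module name is
// illustrative only.)
#[cfg(test)]
mod translate_squeeze_examples {
    use super::translate_squeeze;
    use std::io::Cursor;

    #[test]
    fn translates_then_squeezes() {
        let mut reader = Cursor::new(b"banana".to_vec());
        let mut out = Vec::new();
        // 'a' -> 'n' turns "banana" into "bnnnnn"; the run of 'n' is then squeezed.
        translate_squeeze(b"a", b"n", &mut reader, &mut out).unwrap();
        assert_eq!(out, b"bn".to_vec());
    }
}
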
pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single character delete using SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }

    // Fast paths: 2- and 3-char delete using SIMD memchr2/memchr3
    if delete_chars.len() == 2 || delete_chars.len() == 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    let member = build_member_set(delete_chars);
    // Single buffer with in-place compaction — eliminates outbuf allocation + memcpy
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            // 8-byte unrolled in-place compaction
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                if !is_member(&member, b0) {
                    *ptr.add(wp) = b0;
                    wp += 1;
                }
                if !is_member(&member, b1) {
                    *ptr.add(wp) = b1;
                    wp += 1;
                }
                if !is_member(&member, b2) {
                    *ptr.add(wp) = b2;
                    wp += 1;
                }
                if !is_member(&member, b3) {
                    *ptr.add(wp) = b3;
                    wp += 1;
                }
                if !is_member(&member, b4) {
                    *ptr.add(wp) = b4;
                    wp += 1;
                }
                if !is_member(&member, b5) {
                    *ptr.add(wp) = b5;
                    wp += 1;
                }
                if !is_member(&member, b6) {
                    *ptr.add(wp) = b6;
                    wp += 1;
                }
                if !is_member(&member, b7) {
                    *ptr.add(wp) = b7;
                    wp += 1;
                }
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                if !is_member(&member, b) {
                    *ptr.add(wp) = b;
                    wp += 1;
                }
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

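// Illustrative checks covering both delete paths above: the memchr fast paths
// for 1-3 byte sets and the bitset compaction loop for larger sets. (Module and
// helper names are illustrative only.)
#[cfg(test)]
mod delete_stream_examples {
    use super::delete;
    use std::io::Cursor;

    fn run(set: &[u8], input: &[u8]) -> Vec<u8> {
        let mut reader = Cursor::new(input.to_vec());
        let mut out = Vec::new();
        delete(set, &mut reader, &mut out).unwrap();
        out
    }

    #[test]
    fn single_and_multi_char_delete() {
        assert_eq!(run(b"l", b"hello world"), b"heo word".to_vec());
        assert_eq!(run(b"lo ", b"hello world"), b"hewrd".to_vec());
        assert_eq!(run(b"aeiou!", b"hello world!"), b"hll wrld".to_vec());
    }
}
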
/// Single-character delete from a reader — in-place compaction with SIMD memchr.
/// Uses memchr for SIMD scanning + ptr::copy for in-place shift + single write_all.
fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1; // skip the deleted char
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Multi-character delete (2-3 chars) — in-place compaction with SIMD memchr2/memchr3.
fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &buf[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    // Single buffer with in-place compaction
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single squeeze char — bulk copy non-match runs
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    // Single buffer with in-place compaction — eliminates outbuf allocation + memcpy
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Squeeze a single character from a stream — in-place compaction with SIMD memchr.
/// Single buffer: eliminates outbuf allocation and saves one full memcpy.
/// Uses memchr for fast SIMD scanning, ptr::copy for in-place shift.
fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };

        // In-place squeeze compaction using memchr for SIMD scanning.
        // wp tracks write position (always <= read position i), so in-place is safe.
        let mut wp = 0;
        let mut i = 0;

        while i < n {
            // Cross-chunk continuation: skip squeeze chars from previous chunk
            if was_squeeze_char && buf[i] == ch {
                i += 1;
                while i < n && buf[i] == ch {
                    i += 1;
                }
                // was_squeeze_char stays true until we see a non-squeeze char
                if i >= n {
                    break;
                }
            }

            // Find next occurrence of squeeze char using SIMD memchr
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    let run_len = offset;
                    // Shift non-squeeze run left in-place (skip if already in position)
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    i += run_len;

                    // Emit one squeeze char, skip consecutive duplicates
                    unsafe {
                        *buf.as_mut_ptr().add(wp) = ch;
                    }
                    wp += 1;
                    was_squeeze_char = true;
                    i += 1;
                    while i < n && buf[i] == ch {
                        i += 1;
                    }
                }
                None => {
                    // No more squeeze chars — shift remaining data
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    was_squeeze_char = false;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

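// Illustrative checks for squeeze(): runs of a repeated set member collapse to
// a single byte, via the single-char fast path and the general bitset loop.
// (Module and helper names are illustrative only.)
#[cfg(test)]
mod squeeze_stream_examples {
    use super::squeeze;
    use std::io::Cursor;

    fn run(set: &[u8], input: &[u8]) -> Vec<u8> {
        let mut reader = Cursor::new(input.to_vec());
        let mut out = Vec::new();
        squeeze(set, &mut reader, &mut out).unwrap();
        out
    }

    #[test]
    fn collapses_repeated_members() {
        assert_eq!(run(b" ", b"a  b   c"), b"a b c".to_vec());
        assert_eq!(run(b"lo", b"coool ballloon"), b"col balon".to_vec());
    }
}
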
// ============================================================================
// Mmap-based functions (zero-copy input from byte slice)
// ============================================================================

/// Translate bytes from an mmap'd byte slice — zero syscall reads.
/// Uses SIMD AVX2 for range-delta patterns (e.g., a-z → A-Z).
/// Chunked approach: 1MB buffer fits in L2 cache, avoids large allocations.
/// Translation is memory-bandwidth-bound (not compute-bound), so parallelism
/// offers minimal gain while costing 100MB+ of allocation and zero-init overhead.
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    if matches!(kind, TranslateKind::Identity) {
        return writer.write_all(data);
    }

    // 1MB chunked path — reuses single buffer across chunks.
    // Better than allocating vec![0u8; data.len()] which zero-inits 100MB+.
    // 1MB fits in L2 cache for optimal SIMD throughput.
    let mut out = vec![0u8; BUF_SIZE];
    for chunk in data.chunks(BUF_SIZE) {
        translate_chunk_dispatch(chunk, &mut out[..chunk.len()], &table, &kind, use_simd);
        writer.write_all(&out[..chunk.len()])?;
    }
    Ok(())
}

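// Illustrative usage of the mmap-path API above: it takes a plain byte slice,
// so any in-memory slice works; an empty set pair exercises the Identity
// shortcut. (Module name is illustrative only.)
#[cfg(test)]
mod translate_mmap_examples {
    use super::translate_mmap;

    #[test]
    fn range_translate_and_identity() {
        let mut out = Vec::new();
        translate_mmap(b"abc", b"xyz", b"aabbcc--", &mut out).unwrap();
        assert_eq!(out, b"xxyyzz--".to_vec());

        let mut unchanged = Vec::new();
        translate_mmap(b"", b"", b"pass through", &mut unchanged).unwrap();
        assert_eq!(unchanged, b"pass through".to_vec());
    }
}
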
/// Translate + squeeze from mmap'd byte slice.
/// Single buffer: translate into buffer, then squeeze in-place (wp <= i always holds).
/// Eliminates second buffer allocation and reduces memory traffic.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Single buffer: translate chunk→buf, then squeeze in-place within buf
    let mut buf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        // Phase 1: Translate into buf (may use SIMD)
        translate_chunk_dispatch(chunk, &mut buf[..chunk.len()], &table, &kind, use_simd);

        // Phase 2: Squeeze in-place (wp <= i always, safe for overlapping writes)
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Delete from mmap'd byte slice.
/// Uses SIMD memchr for single-character delete (common case).
/// For multi-char delete, uses 8-byte unrolled scan with bitset lookup.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single character delete uses SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }

    // Fast path: 2-char delete uses SIMD memchr2 (bulk copy between matches)
    if delete_chars.len() == 2 {
        return delete_multi_memchr_mmap::<2>(delete_chars, data, writer);
    }

    // Fast path: 3-char delete uses SIMD memchr3 (bulk copy between matches)
    if delete_chars.len() == 3 {
        return delete_multi_memchr_mmap::<3>(delete_chars, data, writer);
    }

    let member = build_member_set(delete_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        let len = chunk.len();
        let mut i = 0;

        // 8-byte branchless unrolled scan: always write, conditionally advance pointer.
        // Eliminates 8 branches per iteration for better throughput with large delete sets.
        while i + 8 <= len {
            unsafe {
                let b0 = *chunk.get_unchecked(i);
                let b1 = *chunk.get_unchecked(i + 1);
                let b2 = *chunk.get_unchecked(i + 2);
                let b3 = *chunk.get_unchecked(i + 3);
                let b4 = *chunk.get_unchecked(i + 4);
                let b5 = *chunk.get_unchecked(i + 5);
                let b6 = *chunk.get_unchecked(i + 6);
                let b7 = *chunk.get_unchecked(i + 7);

                *outbuf.get_unchecked_mut(out_pos) = b0;
                out_pos += !is_member(&member, b0) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b1;
                out_pos += !is_member(&member, b1) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b2;
                out_pos += !is_member(&member, b2) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b3;
                out_pos += !is_member(&member, b3) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b4;
                out_pos += !is_member(&member, b4) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b5;
                out_pos += !is_member(&member, b5) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b6;
                out_pos += !is_member(&member, b6) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b7;
                out_pos += !is_member(&member, b7) as usize;
            }
            i += 8;
        }

        while i < len {
            unsafe {
                let b = *chunk.get_unchecked(i);
                *outbuf.get_unchecked_mut(out_pos) = b;
                out_pos += !is_member(&member, b) as usize;
            }
            i += 1;
        }

        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Multi-character delete (2-3 chars) using SIMD memchr2/memchr3.
/// Chunked: processes 1MB at a time into contiguous output buffer, single write_all per chunk.
/// Eliminates millions of small BufWriter write_all calls.
fn delete_multi_memchr_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut outbuf = vec![0u8; BUF_SIZE];

    for chunk in data.chunks(BUF_SIZE) {
        let mut wp = 0;
        let mut last = 0;

        macro_rules! process_iter {
            ($iter:expr) => {
                for pos in $iter {
                    if pos > last {
                        let run = pos - last;
                        outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                        wp += run;
                    }
                    last = pos + 1;
                }
            };
        }

        if N == 2 {
            process_iter!(memchr::memchr2_iter(chars[0], chars[1], chunk));
        } else {
            process_iter!(memchr::memchr3_iter(chars[0], chars[1], chars[2], chunk));
        }

        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Single-character delete from mmap using SIMD memchr.
/// Chunked: processes 1MB at a time into contiguous output buffer, single write_all per chunk.
/// Uses memchr_iter (precomputed SIMD state for entire chunk) + bulk copy_from_slice.
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut outbuf = vec![0u8; BUF_SIZE];

    for chunk in data.chunks(BUF_SIZE) {
        let mut wp = 0;
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                let run = pos - last;
                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                wp += run;
            }
            last = pos + 1;
        }
        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Delete + squeeze from mmap'd byte slice.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Squeeze from mmap'd byte slice.
/// Fast paths use memchr/memchr2/memchr3 to find squeezable bytes and bulk-copy
/// the runs between them; larger sets fall back to a chunked member-set scan.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single squeeze character — use SIMD memchr to find runs
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }

    // Fast path: 2-3 squeeze chars — use memchr2/memchr3 for SIMD scanning
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    // General path: chunked output buffer with member check
    let member = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    // Skip consecutive duplicates
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Squeeze with 2-3 char sets using SIMD memchr2/memchr3 for fast scanning.
/// Batched: copies into output buffer, single write_all per buffer fill.
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut wp = 0;
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > BUF_SIZE {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > BUF_SIZE {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                // Copy non-member span + first squeeze char to output buffer
                if pos > cursor {
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    if wp >= BUF_SIZE {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                // Skip consecutive duplicates of same byte
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Squeeze a single repeated character from mmap'd data.
/// Uses a tight byte-at-a-time copy loop with 1MB output buffer.
/// Faster than memmem/IoSlice approach because:
/// - Single pass (no double scan), predictable branches
/// - Bulk write_all with 1MB chunks (fewer syscalls than writev with many small IoSlices)
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // Fast path: no consecutive duplicates — zero-copy output
    if memchr::memmem::find(data, &[ch, ch]).is_none() {
        return writer.write_all(data);
    }

    let mut outbuf = vec![0u8; BUF_SIZE];
    let len = data.len();
    let mut wp = 0;

    unsafe {
        let inp = data.as_ptr();
        let outp = outbuf.as_mut_ptr();
        let mut i = 0;

        while i < len {
            let b = *inp.add(i);
            i += 1;
            *outp.add(wp) = b;
            wp += 1;

            // Skip consecutive duplicates of the squeeze char
            if b == ch {
                while i < len && *inp.add(i) == ch {
                    i += 1;
                }
            }

            if wp == BUF_SIZE {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
        }
    }

    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
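
// An illustrative cross-check: the streaming and mmap implementations of the
// same operation are expected to agree, so a small repeated byte pattern makes
// a cheap consistency test. (Module name and sample data are illustrative.)
#[cfg(test)]
mod mmap_stream_parity_examples {
    use super::{squeeze, squeeze_mmap};
    use std::io::Cursor;

    #[test]
    fn squeeze_paths_agree() {
        let data: Vec<u8> = b"aa  bb\n\ncc  aa".repeat(37);
        let mut mmap_out = Vec::new();
        squeeze_mmap(b" a", &data, &mut mmap_out).unwrap();

        let mut stream_out = Vec::new();
        squeeze(b" a", &mut Cursor::new(data), &mut stream_out).unwrap();

        assert_eq!(mmap_out, stream_out);
    }
}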