coreutils_rs/tr/core.rs

use std::io::{self, Read, Write};

const BUF_SIZE: usize = 1024 * 1024; // 1MB — fits L2/L3 cache for locality

/// Stream buffer: 256KB — process data immediately after each read().
/// Stays in L2 cache, matches typical kernel pipe buffer size.
/// Unlike fill_buf (which loops to accumulate 8MB = ~128 syscalls for pipes),
/// we read once and process immediately, matching GNU tr's approach.
const STREAM_BUF: usize = 256 * 1024;

/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}
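
// Illustrative example (added, not part of the original source): GNU tr extends a
// shorter set2 by repeating its last byte, and bytes outside set1 are left alone.
#[cfg(test)]
mod translate_table_examples {
    use super::build_translate_table;

    #[test]
    fn short_set2_repeats_last_byte() {
        let table = build_translate_table(b"abc", b"xy");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'y');
        assert_eq!(table[b'c' as usize], b'y'); // tail of set1 maps to the last byte of set2
        assert_eq!(table[b'z' as usize], b'z'); // identity outside set1
    }
}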

/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}
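
// Illustrative example (added, not part of the original source): each of the 256
// possible byte values owns one bit in the 32-byte set, so membership is a shift,
// a mask, and a compare.
#[cfg(test)]
mod member_set_examples {
    use super::{build_member_set, is_member};

    #[test]
    fn bitset_membership() {
        let set = build_member_set(b" \t\n");
        assert!(is_member(&set, b' '));
        assert!(is_member(&set, b'\t'));
        assert!(is_member(&set, b'\n'));
        assert!(!is_member(&set, b'x'));
    }
}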

// ============================================================================
// Table analysis for SIMD fast paths
// ============================================================================

/// Classification of a translation table for SIMD optimization.
#[allow(dead_code)] // Fields used only on x86_64 (AVX2 SIMD path)
enum TranslateKind {
    /// Table is identity — no translation needed.
    Identity,
    /// A contiguous range [lo, hi] with constant wrapping-add delta; identity elsewhere.
    RangeDelta { lo: u8, hi: u8, delta: u8 },
    /// Arbitrary translation — use general lookup table.
    General,
}

/// Analyze a translation table to detect SIMD-optimizable patterns.
fn analyze_table(table: &[u8; 256]) -> TranslateKind {
    let mut first_delta: Option<u8> = None;
    let mut lo: u8 = 0;
    let mut hi: u8 = 0;
    let mut count: u32 = 0;

    for i in 0..256u16 {
        let actual = table[i as usize];
        if actual != i as u8 {
            let delta = actual.wrapping_sub(i as u8);
            count += 1;
            match first_delta {
                None => {
                    first_delta = Some(delta);
                    lo = i as u8;
                    hi = i as u8;
                }
                Some(d) if d == delta => {
                    hi = i as u8;
                }
                _ => return TranslateKind::General,
            }
        }
    }

    match (count, first_delta) {
        (0, _) => TranslateKind::Identity,
        (c, Some(delta)) if c == (hi as u32 - lo as u32 + 1) => {
            TranslateKind::RangeDelta { lo, hi, delta }
        }
        _ => TranslateKind::General,
    }
}
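
// Illustrative example (added, not part of the original source): the classic
// `tr a-z A-Z` table is a contiguous range with one constant wrapping delta, so it
// is classified as RangeDelta; an untouched table is classified as Identity.
#[cfg(test)]
mod analyze_table_examples {
    use super::{analyze_table, build_translate_table, TranslateKind};

    #[test]
    fn lowercase_to_uppercase_is_range_delta() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&set1, &set2);
        match analyze_table(&table) {
            TranslateKind::RangeDelta { lo, hi, delta } => {
                assert_eq!((lo, hi), (b'a', b'z'));
                assert_eq!(delta, b'A'.wrapping_sub(b'a'));
            }
            _ => panic!("expected RangeDelta classification"),
        }
    }

    #[test]
    fn empty_sets_give_identity() {
        let table = build_translate_table(b"", b"");
        assert!(matches!(analyze_table(&table), TranslateKind::Identity));
    }
}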

// ============================================================================
// SIMD translation (x86_64 AVX2)
// ============================================================================

#[cfg(target_arch = "x86_64")]
mod simd_tr {
    /// Translate bytes in range [lo, hi] by adding `delta` (wrapping), leave others unchanged.
    /// Processes 128 bytes per iteration (4x unroll) using AVX2.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta(data: &[u8], out: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0usize;

            // 4x unrolled: process 128 bytes per iteration for better ILP
            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(inp.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(inp.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(outp.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(outp.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *inp.add(i);
                *outp.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }

    /// General 256-byte table lookup using 16-way vpshufb (nibble decomposition).
    /// Splits each input byte into high nibble (selects one of 16 shuffle tables)
    /// and low nibble (index within the shuffle table). Processes 64 bytes per
    /// iteration (2x unrolled) for instruction-level parallelism.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn general_lookup(data: &[u8], out: &mut [u8], table: &[u8; 256]) {
        unsafe {
            use std::arch::x86_64::*;

            // Build 16 vpshufb LUTs from the 256-byte translation table.
            // LUT[h] covers table[h*16..h*16+16], broadcast to both 128-bit lanes.
            let tp = table.as_ptr();
            let mut luts = [_mm256_setzero_si256(); 16];
            let mut h = 0;
            while h < 16 {
                luts[h] =
                    _mm256_broadcastsi128_si256(_mm_loadu_si128(tp.add(h * 16) as *const __m128i));
                h += 1;
            }

            let lo_mask = _mm256_set1_epi8(0x0F);
            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0;

            // 2x unrolled: process 64 bytes per iteration
            while i + 64 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);

                let lo0 = _mm256_and_si256(v0, lo_mask);
                let lo1 = _mm256_and_si256(v1, lo_mask);
                let hi0 = _mm256_and_si256(_mm256_srli_epi16(v0, 4), lo_mask);
                let hi1 = _mm256_and_si256(_mm256_srli_epi16(v1, 4), lo_mask);

                let mut r0 = _mm256_setzero_si256();
                let mut r1 = _mm256_setzero_si256();

                // Process all 16 high-nibble values.
                // For each h, bytes where high_nibble == h get their result from luts[h].
                macro_rules! do_nib {
                    ($h:literal) => {
                        let hv = _mm256_set1_epi8($h);
                        let lut = luts[$h as usize];
                        let m0 = _mm256_cmpeq_epi8(hi0, hv);
                        let m1 = _mm256_cmpeq_epi8(hi1, hv);
                        r0 = _mm256_or_si256(
                            r0,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo0), m0),
                        );
                        r1 = _mm256_or_si256(
                            r1,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo1), m1),
                        );
                    };
                }

                do_nib!(0);
                do_nib!(1);
                do_nib!(2);
                do_nib!(3);
                do_nib!(4);
                do_nib!(5);
                do_nib!(6);
                do_nib!(7);
                do_nib!(8);
                do_nib!(9);
                do_nib!(10);
                do_nib!(11);
                do_nib!(12);
                do_nib!(13);
                do_nib!(14);
                do_nib!(15);

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                i += 64;
            }

            // Single vector tail
            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let lo = _mm256_and_si256(v, lo_mask);
                let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

                let mut result = _mm256_setzero_si256();
                let mut hh = 0u8;
                while hh < 16 {
                    let hv = _mm256_set1_epi8(hh as i8);
                    let m = _mm256_cmpeq_epi8(hi, hv);
                    result = _mm256_or_si256(
                        result,
                        _mm256_and_si256(_mm256_shuffle_epi8(luts[hh as usize], lo), m),
                    );
                    hh += 1;
                }

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            // Scalar tail
            while i < len {
                *outp.add(i) = *table.get_unchecked(*inp.add(i) as usize);
                i += 1;
            }
        }
    }

    /// In-place general 256-byte table lookup using 16-way vpshufb.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn general_lookup_inplace(data: &mut [u8], table: &[u8; 256]) {
        unsafe {
            use std::arch::x86_64::*;

            let tp = table.as_ptr();
            let mut luts = [_mm256_setzero_si256(); 16];
            let mut h = 0;
            while h < 16 {
                luts[h] =
                    _mm256_broadcastsi128_si256(_mm_loadu_si128(tp.add(h * 16) as *const __m128i));
                h += 1;
            }

            let lo_mask = _mm256_set1_epi8(0x0F);
            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0;

            while i + 64 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);

                let lo0 = _mm256_and_si256(v0, lo_mask);
                let lo1 = _mm256_and_si256(v1, lo_mask);
                let hi0 = _mm256_and_si256(_mm256_srli_epi16(v0, 4), lo_mask);
                let hi1 = _mm256_and_si256(_mm256_srli_epi16(v1, 4), lo_mask);

                let mut r0 = _mm256_setzero_si256();
                let mut r1 = _mm256_setzero_si256();

                macro_rules! do_nib {
                    ($h:literal) => {
                        let hv = _mm256_set1_epi8($h);
                        let lut = luts[$h as usize];
                        let m0 = _mm256_cmpeq_epi8(hi0, hv);
                        let m1 = _mm256_cmpeq_epi8(hi1, hv);
                        r0 = _mm256_or_si256(
                            r0,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo0), m0),
                        );
                        r1 = _mm256_or_si256(
                            r1,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo1), m1),
                        );
                    };
                }

                do_nib!(0);
                do_nib!(1);
                do_nib!(2);
                do_nib!(3);
                do_nib!(4);
                do_nib!(5);
                do_nib!(6);
                do_nib!(7);
                do_nib!(8);
                do_nib!(9);
                do_nib!(10);
                do_nib!(11);
                do_nib!(12);
                do_nib!(13);
                do_nib!(14);
                do_nib!(15);

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                i += 64;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let lo = _mm256_and_si256(v, lo_mask);
                let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

                let mut result = _mm256_setzero_si256();
                let mut hh = 0u8;
                while hh < 16 {
                    let hv = _mm256_set1_epi8(hh as i8);
                    let m = _mm256_cmpeq_epi8(hi, hv);
                    result = _mm256_or_si256(
                        result,
                        _mm256_and_si256(_mm256_shuffle_epi8(luts[hh as usize], lo), m),
                    );
                    hh += 1;
                }

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = *table.get_unchecked(b as usize);
                i += 1;
            }
        }
    }

    /// In-place SIMD translate for stdin path.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta_inplace(data: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0usize;

            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(ptr.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(ptr.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(ptr.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(ptr.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }
}
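
// Illustrative example (added, not part of the original source): on machines with
// AVX2, the nibble-decomposition lookup should agree byte-for-byte with the scalar
// table walk in translate_chunk.
#[cfg(all(test, target_arch = "x86_64"))]
mod simd_equivalence_examples {
    use super::{build_translate_table, has_avx2, simd_tr, translate_chunk};

    #[test]
    fn general_lookup_matches_scalar_fallback() {
        if !has_avx2() {
            return; // nothing to compare without AVX2
        }
        let table = build_translate_table(b"etaoin", b"ETAOIN");
        let data: Vec<u8> = (0..4096u32).map(|i| (i * 31 % 251) as u8).collect();
        let mut simd_out = vec![0u8; data.len()];
        let mut scalar_out = vec![0u8; data.len()];
        // SAFETY: AVX2 availability checked above; output buffers match data length.
        unsafe { simd_tr::general_lookup(&data, &mut simd_out, &table) };
        translate_chunk(&data, &mut scalar_out, &table);
        assert_eq!(simd_out, scalar_out);
    }
}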

/// Check if AVX2 is available at runtime.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn has_avx2() -> bool {
    is_x86_feature_detected!("avx2")
}

/// Translate a chunk of bytes using a lookup table — unrolled 8-byte inner loop.
#[inline(always)]
fn translate_chunk(chunk: &[u8], out: &mut [u8], table: &[u8; 256]) {
    let len = chunk.len();
    let mut i = 0;
    while i + 8 <= len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
            *out.get_unchecked_mut(i + 1) =
                *table.get_unchecked(*chunk.get_unchecked(i + 1) as usize);
            *out.get_unchecked_mut(i + 2) =
                *table.get_unchecked(*chunk.get_unchecked(i + 2) as usize);
            *out.get_unchecked_mut(i + 3) =
                *table.get_unchecked(*chunk.get_unchecked(i + 3) as usize);
            *out.get_unchecked_mut(i + 4) =
                *table.get_unchecked(*chunk.get_unchecked(i + 4) as usize);
            *out.get_unchecked_mut(i + 5) =
                *table.get_unchecked(*chunk.get_unchecked(i + 5) as usize);
            *out.get_unchecked_mut(i + 6) =
                *table.get_unchecked(*chunk.get_unchecked(i + 6) as usize);
            *out.get_unchecked_mut(i + 7) =
                *table.get_unchecked(*chunk.get_unchecked(i + 7) as usize);
        }
        i += 8;
    }
    while i < len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
        }
        i += 1;
    }
}

/// In-place translate for stdin path — avoids separate output buffer.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let tab = table.as_ptr();

    unsafe {
        let mut i = 0;
        while i + 8 <= len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *tab.add(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *tab.add(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *tab.add(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *tab.add(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *tab.add(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *tab.add(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *tab.add(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// Dispatch: choose SIMD or scalar based on table analysis
// ============================================================================

/// Translate a chunk using the best available method.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_chunk_dispatch(
    chunk: &[u8],
    out: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {
            out[..chunk.len()].copy_from_slice(chunk);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta(chunk, out, *lo, *hi, *delta) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_chunk(chunk, out, table);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::General => {
            if _use_simd {
                unsafe { simd_tr::general_lookup(chunk, out, table) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::General => {
            translate_chunk(chunk, out, table);
        }
    }
}

/// In-place translate dispatch.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_inplace_dispatch(
    data: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {}
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta_inplace(data, *lo, *hi, *delta) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_inplace(data, table);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::General => {
            if _use_simd {
                unsafe { simd_tr::general_lookup_inplace(data, table) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::General => {
            translate_inplace(data, table);
        }
    }
}

// ============================================================================
// Streaming functions (Read + Write)
// Process data immediately after each read() — no fill_buf accumulation.
// Most use the 256KB STREAM_BUF (L2-friendly); plain translate uses the 1MB
// BUF_SIZE to cut read() syscalls on pipes.
// ============================================================================

pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Use 1MB buffer for fewer read() syscalls on pipes
    let mut buf = vec![0u8; BUF_SIZE];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        translate_inplace_dispatch(&mut buf[..n], &table, &kind, use_simd);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}
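
// Illustrative usage (added, not part of the original source): the streaming entry
// points take any Read/Write pair, so std::io::Cursor and Vec<u8> are enough to
// drive them in isolation.
#[cfg(test)]
mod translate_stream_examples {
    use super::translate;
    use std::io::Cursor;

    #[test]
    fn uppercases_ascii_lowercase() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut input = Cursor::new(&b"hello, tr!"[..]);
        let mut output = Vec::new();
        translate(&set1, &set2, &mut input, &mut output).unwrap();
        assert_eq!(output, b"HELLO, TR!".to_vec());
    }
}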

pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Single buffer: SIMD translate in-place, then squeeze in-place compaction.
    // Eliminates outbuf allocation and saves one full memcpy of data.
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // Phase 1: SIMD translate in-place (uses AVX2 when available)
        translate_inplace_dispatch(&mut buf[..n], &table, &kind, use_simd);
        // Phase 2: squeeze in-place compaction (wp <= i always, safe)
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single character delete using SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }

    // Fast paths: 2-3 char delete using SIMD memchr2/memchr3
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    let member = build_member_set(delete_chars);
    // Single buffer with in-place compaction — eliminates outbuf allocation + memcpy
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            // 8-byte unrolled in-place compaction
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                if !is_member(&member, b0) {
                    *ptr.add(wp) = b0;
                    wp += 1;
                }
                if !is_member(&member, b1) {
                    *ptr.add(wp) = b1;
                    wp += 1;
                }
                if !is_member(&member, b2) {
                    *ptr.add(wp) = b2;
                    wp += 1;
                }
                if !is_member(&member, b3) {
                    *ptr.add(wp) = b3;
                    wp += 1;
                }
                if !is_member(&member, b4) {
                    *ptr.add(wp) = b4;
                    wp += 1;
                }
                if !is_member(&member, b5) {
                    *ptr.add(wp) = b5;
                    wp += 1;
                }
                if !is_member(&member, b6) {
                    *ptr.add(wp) = b6;
                    wp += 1;
                }
                if !is_member(&member, b7) {
                    *ptr.add(wp) = b7;
                    wp += 1;
                }
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                if !is_member(&member, b) {
                    *ptr.add(wp) = b;
                    wp += 1;
                }
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
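
// Illustrative usage (added, not part of the original source): delete() routes
// 1-3 byte sets to the memchr fast paths and larger sets to the bitset compaction
// loop above; the observable result is the same either way.
#[cfg(test)]
mod delete_stream_examples {
    use super::delete;
    use std::io::Cursor;

    #[test]
    fn removes_digits_and_vowels() {
        let mut input = Cursor::new(&b"c0o2r4e6u8t1i3l5s"[..]);
        let mut output = Vec::new();
        delete(b"0123456789aeiou", &mut input, &mut output).unwrap();
        assert_eq!(output, b"crtls".to_vec());
    }

    #[test]
    fn single_char_fast_path() {
        let mut input = Cursor::new(&b"a-b-c-"[..]);
        let mut output = Vec::new();
        delete(b"-", &mut input, &mut output).unwrap();
        assert_eq!(output, b"abc".to_vec());
    }
}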

/// Single-character delete from a reader — in-place compaction with SIMD memchr.
/// Uses memchr for SIMD scanning + ptr::copy for in-place shift + single write_all.
fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1; // skip the deleted char
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Multi-character delete (2-3 chars) — in-place compaction with SIMD memchr2/memchr3.
fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &buf[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    // Single buffer with in-place compaction
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single squeeze char — bulk copy non-match runs
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    // Single buffer with in-place compaction — eliminates outbuf allocation + memcpy
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Squeeze a single character from a stream — in-place compaction with SIMD memchr.
/// Single buffer: eliminates outbuf allocation and saves one full memcpy.
/// Uses memchr for fast SIMD scanning, ptr::copy for in-place shift.
fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };

        // In-place squeeze compaction using memchr for SIMD scanning.
        // wp tracks write position (always <= read position i), so in-place is safe.
        let mut wp = 0;
        let mut i = 0;

        while i < n {
            // Cross-chunk continuation: skip squeeze chars from previous chunk
            if was_squeeze_char && buf[i] == ch {
                i += 1;
                while i < n && buf[i] == ch {
                    i += 1;
                }
                // was_squeeze_char stays true until we see a non-squeeze char
                if i >= n {
                    break;
                }
            }

            // Find next occurrence of squeeze char using SIMD memchr
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    let run_len = offset;
                    // Shift non-squeeze run left in-place (skip if already in position)
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    i += run_len;

                    // Emit one squeeze char, skip consecutive duplicates
                    unsafe {
                        *buf.as_mut_ptr().add(wp) = ch;
                    }
                    wp += 1;
                    was_squeeze_char = true;
                    i += 1;
                    while i < n && buf[i] == ch {
                        i += 1;
                    }
                }
                None => {
                    // No more squeeze chars — shift remaining data
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    was_squeeze_char = false;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
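
// Illustrative example (added, not part of the original source): a reader that
// hands back one byte per read() forces a squeeze run to straddle buffer
// boundaries, which is the case the was_squeeze_char carry-over exists for.
#[cfg(test)]
mod squeeze_chunk_boundary_examples {
    use super::squeeze;
    use std::io::{self, Read};

    struct OneBytePerRead<'a>(&'a [u8]);

    impl Read for OneBytePerRead<'_> {
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            if self.0.is_empty() || buf.is_empty() {
                return Ok(0);
            }
            buf[0] = self.0[0];
            self.0 = &self.0[1..];
            Ok(1)
        }
    }

    #[test]
    fn squeeze_run_across_reads() {
        let mut input = OneBytePerRead(b"a    b  c");
        let mut output = Vec::new();
        squeeze(b" ", &mut input, &mut output).unwrap();
        assert_eq!(output, b"a b c".to_vec());
    }
}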

// ============================================================================
// Mmap-based functions (zero-copy input from byte slice)
// ============================================================================

/// Translate bytes from an mmap'd byte slice — zero syscall reads.
/// Uses SIMD AVX2 for range-delta patterns (e.g., a-z → A-Z).
/// Chunked approach: 1MB buffer fits in L2 cache, avoids large allocations.
/// Translation is memory-bandwidth-bound (not compute-bound), so parallelism
/// offers minimal gain while costing 100MB+ of allocation and zero-init overhead.
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    if matches!(kind, TranslateKind::Identity) {
        return writer.write_all(data);
    }

    // Chunked path — reuses single buffer across chunks.
    // Size buffer to min(data.len(), 1MB) to avoid over-allocating for small files.
    let buf_size = data.len().min(BUF_SIZE);
    let mut out = vec![0u8; buf_size];
    for chunk in data.chunks(buf_size) {
        translate_chunk_dispatch(chunk, &mut out[..chunk.len()], &table, &kind, use_simd);
        writer.write_all(&out[..chunk.len()])?;
    }
    Ok(())
}
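
// Illustrative usage (added, not part of the original source): the mmap-style entry
// points accept any in-memory byte slice, so a literal works for a quick check;
// this ROT13 table is not a single-delta range, so it exercises the general path.
#[cfg(test)]
mod translate_mmap_examples {
    use super::translate_mmap;

    #[test]
    fn rot13_over_a_slice() {
        let set1: Vec<u8> = (b'a'..=b'z').chain(b'A'..=b'Z').collect();
        let set2: Vec<u8> = (b'n'..=b'z')
            .chain(b'a'..=b'm')
            .chain(b'N'..=b'Z')
            .chain(b'A'..=b'M')
            .collect();
        let mut output = Vec::new();
        translate_mmap(&set1, &set2, b"Hello, World!", &mut output).unwrap();
        assert_eq!(output, b"Uryyb, Jbeyq!".to_vec());
    }
}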

/// Translate + squeeze from mmap'd byte slice.
/// Single buffer: translate into buffer, then squeeze in-place (wp <= i always holds).
/// Eliminates second buffer allocation and reduces memory traffic.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Single buffer: translate chunk→buf, then squeeze in-place within buf
    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        // Phase 1: Translate into buf (may use SIMD)
        translate_chunk_dispatch(chunk, &mut buf[..chunk.len()], &table, &kind, use_simd);

        // Phase 2: Squeeze in-place (wp <= i always, safe for overlapping writes)
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Delete from mmap'd byte slice.
/// Uses SIMD memchr for single-character delete (common case).
/// For multi-char delete, uses 8-byte unrolled scan with bitset lookup.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single character delete uses SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }

    // Fast path: 2-char delete uses SIMD memchr2 (bulk copy between matches)
    if delete_chars.len() == 2 {
        return delete_multi_memchr_mmap::<2>(delete_chars, data, writer);
    }

    // Fast path: 3-char delete uses SIMD memchr3 (bulk copy between matches)
    if delete_chars.len() == 3 {
        return delete_multi_memchr_mmap::<3>(delete_chars, data, writer);
    }

    let member = build_member_set(delete_chars);
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        let len = chunk.len();
        let mut i = 0;

        // 8-byte branchless unrolled scan: always write, conditionally advance pointer.
        // Eliminates 8 branches per iteration for better throughput with large delete sets.
        while i + 8 <= len {
            unsafe {
                let b0 = *chunk.get_unchecked(i);
                let b1 = *chunk.get_unchecked(i + 1);
                let b2 = *chunk.get_unchecked(i + 2);
                let b3 = *chunk.get_unchecked(i + 3);
                let b4 = *chunk.get_unchecked(i + 4);
                let b5 = *chunk.get_unchecked(i + 5);
                let b6 = *chunk.get_unchecked(i + 6);
                let b7 = *chunk.get_unchecked(i + 7);

                *outbuf.get_unchecked_mut(out_pos) = b0;
                out_pos += !is_member(&member, b0) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b1;
                out_pos += !is_member(&member, b1) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b2;
                out_pos += !is_member(&member, b2) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b3;
                out_pos += !is_member(&member, b3) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b4;
                out_pos += !is_member(&member, b4) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b5;
                out_pos += !is_member(&member, b5) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b6;
                out_pos += !is_member(&member, b6) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b7;
                out_pos += !is_member(&member, b7) as usize;
            }
            i += 8;
        }

        while i < len {
            unsafe {
                let b = *chunk.get_unchecked(i);
                *outbuf.get_unchecked_mut(out_pos) = b;
                out_pos += !is_member(&member, b) as usize;
            }
            i += 1;
        }

        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Multi-character delete (2-3 chars) using SIMD memchr2/memchr3.
/// Chunked: processes 1MB at a time into contiguous output buffer, single write_all per chunk.
/// Eliminates millions of small BufWriter write_all calls.
fn delete_multi_memchr_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;

        macro_rules! process_iter {
            ($iter:expr) => {
                for pos in $iter {
                    if pos > last {
                        let run = pos - last;
                        outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                        wp += run;
                    }
                    last = pos + 1;
                }
            };
        }

        if N == 2 {
            process_iter!(memchr::memchr2_iter(chars[0], chars[1], chunk));
        } else {
            process_iter!(memchr::memchr3_iter(chars[0], chars[1], chars[2], chunk));
        }

        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Single-character delete from mmap using SIMD memchr.
/// Chunked: processes 1MB at a time into contiguous output buffer, single write_all per chunk.
/// Uses memchr_iter (precomputed SIMD state for entire chunk) + bulk copy_from_slice.
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                let run = pos - last;
                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                wp += run;
            }
            last = pos + 1;
        }
        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Delete + squeeze from mmap'd byte slice.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Squeeze from mmap'd byte slice.
/// Uses a two-pass approach: find runs of squeezable bytes with memchr,
/// then copy non-squeezed content in bulk.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single squeeze character — use SIMD memchr to find runs
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }

    // Fast path: 2-3 squeeze chars — use memchr2/memchr3 for SIMD scanning
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    // General path: chunked output buffer with member check
    let member = build_member_set(squeeze_chars);
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    // Skip consecutive duplicates
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Squeeze with 2-3 char sets using SIMD memchr2/memchr3 for fast scanning.
/// Batched: copies into output buffer, single write_all per buffer fill.
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut wp = 0;
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > buf_size {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > buf_size {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                // Copy non-member span + first squeeze char to output buffer
                if pos > cursor {
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    if wp >= buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                // Skip consecutive duplicates of same byte
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Squeeze a single repeated character from mmap'd data.
/// Uses SIMD memchr for fast scanning + bulk copy of non-squeeze regions.
/// For each squeeze run: copy gap in bulk, emit one squeeze char, skip duplicates.
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // Fast path: no consecutive duplicates — zero-copy output
    if memchr::memmem::find(data, &[ch, ch]).is_none() {
        return writer.write_all(data);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let len = data.len();
    let mut wp = 0;
    let mut cursor = 0;

    while cursor < len {
        // SIMD scan for next occurrence of squeeze char
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;

                // Bulk copy non-squeeze region [cursor..pos]
                let gap = pos - cursor;
                if gap > 0 {
                    if wp + gap > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if gap > buf_size {
                        // Huge gap: write directly
                        writer.write_all(&data[cursor..pos])?;
                    } else {
                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
                        wp += gap;
                    }
                }

                // Emit one squeeze char
                if wp >= buf_size {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }
                outbuf[wp] = ch;
                wp += 1;

                // Skip consecutive duplicates
                cursor = pos + 1;
                while cursor < len && data[cursor] == ch {
                    cursor += 1;
                }
            }
            None => {
                // No more squeeze chars — bulk copy remainder
                let remaining = len - cursor;
                if remaining > 0 {
                    if wp + remaining > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if remaining > buf_size {
                        writer.write_all(&data[cursor..])?;
                    } else {
                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
                        wp += remaining;
                    }
                }
                break;
            }
        }
    }

    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
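
// Illustrative usage (added, not part of the original source): squeeze_mmap with a
// single squeeze byte takes the memchr run-scanning path above, including the
// zero-copy shortcut when the input has no consecutive duplicates.
#[cfg(test)]
mod squeeze_mmap_examples {
    use super::squeeze_mmap;

    #[test]
    fn collapses_repeated_spaces() {
        let mut output = Vec::new();
        squeeze_mmap(b" ", b"a  b   c", &mut output).unwrap();
        assert_eq!(output, b"a b c".to_vec());
    }

    #[test]
    fn input_without_runs_is_passed_through() {
        let mut output = Vec::new();
        squeeze_mmap(b" ", b"a b c", &mut output).unwrap();
        assert_eq!(output, b"a b c".to_vec());
    }
}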