coreutils_rs/tr/core.rs

use rayon::prelude::*;
use std::io::{self, Read, Write};

/// Main processing buffer: 4MB — large enough to amortize write() syscall overhead
/// while still fitting in L3 cache.
const BUF_SIZE: usize = 4 * 1024 * 1024;

/// Stream buffer: 1MB — process data immediately after each read().
/// Larger than a typical pipe buffer (64KB) to batch multiple reads into one processing cycle.
const STREAM_BUF: usize = 1024 * 1024;

/// Minimum data size for parallel processing (2MB).
/// Below this, rayon thread pool overhead (~5-10μs) exceeds the benefit.
const PAR_THRESHOLD: usize = 2 * 1024 * 1024;

/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
/// When set2 is shorter than set1, it is padded with its last byte (GNU tr semantics);
/// when set2 is empty, the table stays identity.
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}
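
// A minimal illustrative check of the padding rule above (test module and
// values are illustrative additions, not from the original source):
#[cfg(test)]
mod translate_table_examples {
    use super::build_translate_table;

    #[test]
    fn set2_is_padded_with_its_last_byte() {
        let t = build_translate_table(b"abc", b"xy");
        assert_eq!(t[b'a' as usize], b'x');
        assert_eq!(t[b'b' as usize], b'y');
        assert_eq!(t[b'c' as usize], b'y'); // padded with 'y'
        assert_eq!(t[b'z' as usize], b'z'); // identity elsewhere
    }
}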

/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    // SAFETY: ch >> 3 is at most 31, always in bounds for a 32-byte array.
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}
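
// Bit-set sanity sketch: byte b lives at bit (b & 7) of byte (b >> 3) in the
// 32-byte set (illustrative test, not from the original source):
#[cfg(test)]
mod member_set_examples {
    use super::{build_member_set, is_member};

    #[test]
    fn membership_round_trips() {
        let set = build_member_set(b"\n\t ");
        assert!(is_member(&set, b'\n'));
        assert!(is_member(&set, b'\t'));
        assert!(is_member(&set, b' '));
        assert!(!is_member(&set, b'a'));
    }
}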

// ============================================================================
// Table analysis for SIMD fast paths
// ============================================================================

/// Classification of a translation table for SIMD optimization.
#[allow(dead_code)] // Fields used only on x86_64 (AVX2 SIMD path)
enum TranslateKind {
    /// Table is identity — no translation needed.
    Identity,
    /// A contiguous range [lo, hi] with constant wrapping-add delta; identity elsewhere.
    RangeDelta { lo: u8, hi: u8, delta: u8 },
    /// Arbitrary translation — use general lookup table.
    General,
}

/// Analyze a translation table to detect SIMD-optimizable patterns.
fn analyze_table(table: &[u8; 256]) -> TranslateKind {
    let mut first_delta: Option<u8> = None;
    let mut lo: u8 = 0;
    let mut hi: u8 = 0;
    let mut count: u32 = 0;

    for i in 0..256u16 {
        let actual = table[i as usize];
        if actual != i as u8 {
            let delta = actual.wrapping_sub(i as u8);
            count += 1;
            match first_delta {
                None => {
                    first_delta = Some(delta);
                    lo = i as u8;
                    hi = i as u8;
                }
                Some(d) if d == delta => {
                    hi = i as u8;
                }
                _ => return TranslateKind::General,
            }
        }
    }

    // RangeDelta additionally requires the non-identity entries to be contiguous:
    // count equals the size of [lo, hi] exactly when the run has no gaps.
    match (count, first_delta) {
        (0, _) => TranslateKind::Identity,
        (c, Some(delta)) if c == (hi as u32 - lo as u32 + 1) => {
            TranslateKind::RangeDelta { lo, hi, delta }
        }
        _ => TranslateKind::General,
    }
}
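
// Classification sketch: a-z → A-Z is one contiguous range with constant
// wrapping delta b'A'.wrapping_sub(b'a') == 224, so it should classify as
// RangeDelta (illustrative test, not from the original source):
#[cfg(test)]
mod analyze_table_examples {
    use super::{analyze_table, build_translate_table, TranslateKind};

    #[test]
    fn lowercase_to_uppercase_is_range_delta() {
        let table = build_translate_table(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        );
        match analyze_table(&table) {
            TranslateKind::RangeDelta { lo, hi, delta } => {
                assert_eq!((lo, hi, delta), (b'a', b'z', b'A'.wrapping_sub(b'a')));
            }
            _ => panic!("expected RangeDelta"),
        }
    }
}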

// ============================================================================
// SIMD translation (x86_64 AVX2)
// ============================================================================

#[cfg(target_arch = "x86_64")]
mod simd_tr {
    /// Translate bytes in range [lo, hi] by adding `delta` (wrapping), leave others unchanged.
    /// Processes 128 bytes per iteration (4x unroll) using AVX2.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta(data: &[u8], out: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0usize;

            // 4x unrolled: process 128 bytes per iteration for better ILP
            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(inp.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(inp.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

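                // Unsigned range test via min: min_epu8(d, range) == d iff d <= range,
                // i.e. the original byte lies in [lo, hi]. AVX2 has no unsigned byte
                // compare, so this min/cmpeq pair stands in for one (explanatory note).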
                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(outp.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(outp.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *inp.add(i);
                *outp.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }

    /// General 256-byte table lookup using 16-way vpshufb (nibble decomposition).
    /// Splits each input byte into high nibble (selects one of 16 shuffle tables)
    /// and low nibble (index within the shuffle table). Processes 64 bytes per
    /// iteration (2x unrolled) for instruction-level parallelism.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn general_lookup(data: &[u8], out: &mut [u8], table: &[u8; 256]) {
        unsafe {
            use std::arch::x86_64::*;

            // Build 16 vpshufb LUTs from the 256-byte translation table.
            // LUT[h] covers table[h*16..h*16+16], broadcast to both 128-bit lanes.
            let tp = table.as_ptr();
            let mut luts = [_mm256_setzero_si256(); 16];
            let mut h = 0;
            while h < 16 {
                luts[h] =
                    _mm256_broadcastsi128_si256(_mm_loadu_si128(tp.add(h * 16) as *const __m128i));
                h += 1;
            }

            let lo_mask = _mm256_set1_epi8(0x0F);
            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0;

            // 2x unrolled: process 64 bytes per iteration
            while i + 64 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);

                let lo0 = _mm256_and_si256(v0, lo_mask);
                let lo1 = _mm256_and_si256(v1, lo_mask);
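                // Note (explanatory): _mm256_srli_epi16 shifts within 16-bit lanes,
                // so bits from the neighboring byte leak into each shifted byte; the
                // AND with 0x0F below discards them, leaving just the high nibble.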
                let hi0 = _mm256_and_si256(_mm256_srli_epi16(v0, 4), lo_mask);
                let hi1 = _mm256_and_si256(_mm256_srli_epi16(v1, 4), lo_mask);

                let mut r0 = _mm256_setzero_si256();
                let mut r1 = _mm256_setzero_si256();

                // Process all 16 high-nibble values.
                // For each h, bytes where high_nibble == h get their result from luts[h].
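                // Each byte matches exactly one of the 16 masks, so the masked
                // shuffle results are disjoint and OR-accumulation assembles the
                // full lookup (explanatory note).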
                macro_rules! do_nib {
                    ($h:literal) => {
                        let hv = _mm256_set1_epi8($h);
                        let lut = luts[$h as usize];
                        let m0 = _mm256_cmpeq_epi8(hi0, hv);
                        let m1 = _mm256_cmpeq_epi8(hi1, hv);
                        r0 = _mm256_or_si256(
                            r0,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo0), m0),
                        );
                        r1 = _mm256_or_si256(
                            r1,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo1), m1),
                        );
                    };
                }

                do_nib!(0);
                do_nib!(1);
                do_nib!(2);
                do_nib!(3);
                do_nib!(4);
                do_nib!(5);
                do_nib!(6);
                do_nib!(7);
                do_nib!(8);
                do_nib!(9);
                do_nib!(10);
                do_nib!(11);
                do_nib!(12);
                do_nib!(13);
                do_nib!(14);
                do_nib!(15);

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                i += 64;
            }

            // Single vector tail
            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let lo = _mm256_and_si256(v, lo_mask);
                let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

                let mut result = _mm256_setzero_si256();
                let mut hh = 0u8;
                while hh < 16 {
                    let hv = _mm256_set1_epi8(hh as i8);
                    let m = _mm256_cmpeq_epi8(hi, hv);
                    result = _mm256_or_si256(
                        result,
                        _mm256_and_si256(_mm256_shuffle_epi8(luts[hh as usize], lo), m),
                    );
                    hh += 1;
                }

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            // Scalar tail
            while i < len {
                *outp.add(i) = *table.get_unchecked(*inp.add(i) as usize);
                i += 1;
            }
        }
    }

    /// In-place general 256-byte table lookup using 16-way vpshufb.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn general_lookup_inplace(data: &mut [u8], table: &[u8; 256]) {
        unsafe {
            use std::arch::x86_64::*;

            let tp = table.as_ptr();
            let mut luts = [_mm256_setzero_si256(); 16];
            let mut h = 0;
            while h < 16 {
                luts[h] =
                    _mm256_broadcastsi128_si256(_mm_loadu_si128(tp.add(h * 16) as *const __m128i));
                h += 1;
            }

            let lo_mask = _mm256_set1_epi8(0x0F);
            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0;

            while i + 64 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);

                let lo0 = _mm256_and_si256(v0, lo_mask);
                let lo1 = _mm256_and_si256(v1, lo_mask);
                let hi0 = _mm256_and_si256(_mm256_srli_epi16(v0, 4), lo_mask);
                let hi1 = _mm256_and_si256(_mm256_srli_epi16(v1, 4), lo_mask);

                let mut r0 = _mm256_setzero_si256();
                let mut r1 = _mm256_setzero_si256();

                macro_rules! do_nib {
                    ($h:literal) => {
                        let hv = _mm256_set1_epi8($h);
                        let lut = luts[$h as usize];
                        let m0 = _mm256_cmpeq_epi8(hi0, hv);
                        let m1 = _mm256_cmpeq_epi8(hi1, hv);
                        r0 = _mm256_or_si256(
                            r0,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo0), m0),
                        );
                        r1 = _mm256_or_si256(
                            r1,
                            _mm256_and_si256(_mm256_shuffle_epi8(lut, lo1), m1),
                        );
                    };
                }

                do_nib!(0);
                do_nib!(1);
                do_nib!(2);
                do_nib!(3);
                do_nib!(4);
                do_nib!(5);
                do_nib!(6);
                do_nib!(7);
                do_nib!(8);
                do_nib!(9);
                do_nib!(10);
                do_nib!(11);
                do_nib!(12);
                do_nib!(13);
                do_nib!(14);
                do_nib!(15);

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                i += 64;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let lo = _mm256_and_si256(v, lo_mask);
                let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

                let mut result = _mm256_setzero_si256();
                let mut hh = 0u8;
                while hh < 16 {
                    let hv = _mm256_set1_epi8(hh as i8);
                    let m = _mm256_cmpeq_epi8(hi, hv);
                    result = _mm256_or_si256(
                        result,
                        _mm256_and_si256(_mm256_shuffle_epi8(luts[hh as usize], lo), m),
                    );
                    hh += 1;
                }

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = *table.get_unchecked(b as usize);
                i += 1;
            }
        }
    }

    /// In-place SIMD translate for stdin path.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta_inplace(data: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0usize;

            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(ptr.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(ptr.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(ptr.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(ptr.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }
}

/// Check if AVX2 is available at runtime.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn has_avx2() -> bool {
    is_x86_feature_detected!("avx2")
}

/// Translate a chunk of bytes using a lookup table — unrolled 8-byte inner loop.
#[inline(always)]
fn translate_chunk(chunk: &[u8], out: &mut [u8], table: &[u8; 256]) {
    let len = chunk.len();
    let mut i = 0;
    while i + 8 <= len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
            *out.get_unchecked_mut(i + 1) =
                *table.get_unchecked(*chunk.get_unchecked(i + 1) as usize);
            *out.get_unchecked_mut(i + 2) =
                *table.get_unchecked(*chunk.get_unchecked(i + 2) as usize);
            *out.get_unchecked_mut(i + 3) =
                *table.get_unchecked(*chunk.get_unchecked(i + 3) as usize);
            *out.get_unchecked_mut(i + 4) =
                *table.get_unchecked(*chunk.get_unchecked(i + 4) as usize);
            *out.get_unchecked_mut(i + 5) =
                *table.get_unchecked(*chunk.get_unchecked(i + 5) as usize);
            *out.get_unchecked_mut(i + 6) =
                *table.get_unchecked(*chunk.get_unchecked(i + 6) as usize);
            *out.get_unchecked_mut(i + 7) =
                *table.get_unchecked(*chunk.get_unchecked(i + 7) as usize);
        }
        i += 8;
    }
    while i < len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
        }
        i += 1;
    }
}

/// In-place translate for stdin path — avoids separate output buffer.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let tab = table.as_ptr();

    unsafe {
        let mut i = 0;
        while i + 8 <= len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *tab.add(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *tab.add(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *tab.add(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *tab.add(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *tab.add(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *tab.add(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *tab.add(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// Dispatch: choose SIMD or scalar based on table analysis
// ============================================================================

/// Translate a chunk using the best available method.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_chunk_dispatch(
    chunk: &[u8],
    out: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {
            out[..chunk.len()].copy_from_slice(chunk);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta(chunk, out, *lo, *hi, *delta) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_chunk(chunk, out, table);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::General => {
            if _use_simd {
                unsafe { simd_tr::general_lookup(chunk, out, table) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::General => {
            translate_chunk(chunk, out, table);
        }
    }
}

/// In-place translate dispatch.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_inplace_dispatch(
    data: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {}
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta_inplace(data, *lo, *hi, *delta) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_inplace(data, table);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::General => {
            if _use_simd {
                unsafe { simd_tr::general_lookup_inplace(data, table) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::General => {
            translate_inplace(data, table);
        }
    }
}

// ============================================================================
// Streaming functions (Read + Write)
// Process data immediately after each read() — no fill_buf accumulation.
// Most paths use the 1MB STREAM_BUF; plain translate uses the 4MB BUF_SIZE.
// ============================================================================

pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Use the large 4MB BUF_SIZE buffer for fewer read() syscalls on pipes
    let mut buf = vec![0u8; BUF_SIZE];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        translate_inplace_dispatch(&mut buf[..n], &table, &kind, use_simd);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}
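
// Usage sketch over in-memory I/O (&[u8] implements Read, Vec<u8> implements
// Write); illustrative test, not from the original source:
#[cfg(test)]
mod translate_stream_examples {
    use super::translate;

    #[test]
    fn uppercases_ascii_letters() {
        let mut input: &[u8] = b"hello, world";
        let mut out = Vec::new();
        translate(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
            &mut input,
            &mut out,
        )
        .unwrap();
        assert_eq!(out, b"HELLO, WORLD");
    }
}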

pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Single buffer: SIMD translate in-place, then squeeze via in-place compaction.
    // Eliminates the outbuf allocation and saves one full memcpy of the data.
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // Phase 1: SIMD translate in-place (uses AVX2 when available)
        translate_inplace_dispatch(&mut buf[..n], &table, &kind, use_simd);
        // Phase 2: squeeze via in-place compaction (wp <= i always, so this is safe)
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
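
// Behavior sketch: squeezing applies to bytes that are members of set2 after
// translation (illustrative test, not from the original source):
#[cfg(test)]
mod translate_squeeze_examples {
    use super::translate_squeeze;

    #[test]
    fn squeezes_translated_runs() {
        let mut input: &[u8] = b"aabb";
        let mut out = Vec::new();
        // a -> x and b -> x, then the run of x's collapses to one.
        translate_squeeze(b"ab", b"xx", &mut input, &mut out).unwrap();
        assert_eq!(out, b"x");
    }
}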

pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single-character delete using SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }

    // Fast paths: 2-3 char delete using SIMD memchr2/memchr3.
    // Matched exactly so an empty delete set falls through to the general path
    // instead of indexing out of bounds in delete_multi_streaming.
    if delete_chars.len() == 2 || delete_chars.len() == 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    let member = build_member_set(delete_chars);
    // Single buffer with in-place compaction — eliminates outbuf allocation + memcpy
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            // 8-byte unrolled in-place compaction
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                if !is_member(&member, b0) {
                    *ptr.add(wp) = b0;
                    wp += 1;
                }
                if !is_member(&member, b1) {
                    *ptr.add(wp) = b1;
                    wp += 1;
                }
                if !is_member(&member, b2) {
                    *ptr.add(wp) = b2;
                    wp += 1;
                }
                if !is_member(&member, b3) {
                    *ptr.add(wp) = b3;
                    wp += 1;
                }
                if !is_member(&member, b4) {
                    *ptr.add(wp) = b4;
                    wp += 1;
                }
                if !is_member(&member, b5) {
                    *ptr.add(wp) = b5;
                    wp += 1;
                }
                if !is_member(&member, b6) {
                    *ptr.add(wp) = b6;
                    wp += 1;
                }
                if !is_member(&member, b7) {
                    *ptr.add(wp) = b7;
                    wp += 1;
                }
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                if !is_member(&member, b) {
                    *ptr.add(wp) = b;
                    wp += 1;
                }
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
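
// Streaming delete sketch (five chars exercises the member-set path;
// illustrative test, not from the original source):
#[cfg(test)]
mod delete_stream_examples {
    use super::delete;

    #[test]
    fn deletes_vowels_from_a_stream() {
        let mut input: &[u8] = b"the quick brown fox";
        let mut out = Vec::new();
        delete(b"aeiou", &mut input, &mut out).unwrap();
        assert_eq!(out, b"th qck brwn fx");
    }
}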

/// Single-character delete from a reader — in-place compaction with SIMD memchr.
/// Uses memchr for SIMD scanning + ptr::copy for the in-place shift + one write_all per chunk.
fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1; // skip the deleted char
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Multi-character delete (2-3 chars) — in-place compaction with SIMD memchr2/memchr3.
fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &buf[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    // Single buffer with in-place compaction
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
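
// Order-of-operations sketch: delete runs first, so squeeze sees the surviving
// bytes as adjacent (illustrative test, not from the original source):
#[cfg(test)]
mod delete_squeeze_examples {
    use super::delete_squeeze;

    #[test]
    fn squeeze_applies_after_delete() {
        let mut input: &[u8] = b"abba abba";
        let mut out = Vec::new();
        // Deleting 'a' leaves "bb bb"; squeezing 'b' yields "b b".
        delete_squeeze(b"a", b"b", &mut input, &mut out).unwrap();
        assert_eq!(out, b"b b");
    }
}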

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single squeeze char — bulk copy non-match runs
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    // Single buffer with in-place compaction — eliminates outbuf allocation + memcpy
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
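
// Squeeze sketch: repeated members collapse to one per run (illustrative test,
// not from the original source):
#[cfg(test)]
mod squeeze_examples {
    use super::squeeze;

    #[test]
    fn collapses_runs_of_members() {
        let mut input: &[u8] = b"a  b\t\t\tc";
        let mut out = Vec::new();
        squeeze(b" \t", &mut input, &mut out).unwrap();
        assert_eq!(out, b"a b\tc");
    }
}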

/// Squeeze a single character from a stream — in-place compaction with SIMD memchr.
/// Single buffer: eliminates the outbuf allocation and saves one full memcpy.
/// Uses memchr for fast SIMD scanning, ptr::copy for the in-place shift.
fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };

        // In-place squeeze compaction using memchr for SIMD scanning.
        // wp tracks the write position (always <= read position i), so in-place is safe.
        let mut wp = 0;
        let mut i = 0;

        while i < n {
            // Cross-chunk continuation: skip squeeze chars carried over from the previous chunk
            if was_squeeze_char && buf[i] == ch {
                i += 1;
                while i < n && buf[i] == ch {
                    i += 1;
                }
                // was_squeeze_char stays true until we see a non-squeeze char
                if i >= n {
                    break;
                }
            }

            // Find the next occurrence of the squeeze char using SIMD memchr
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    let run_len = offset;
                    // Shift the non-squeeze run left in-place (skip if already in position)
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    i += run_len;

                    // Emit one squeeze char, skip consecutive duplicates
                    unsafe {
                        *buf.as_mut_ptr().add(wp) = ch;
                    }
                    wp += 1;
                    was_squeeze_char = true;
                    i += 1;
                    while i < n && buf[i] == ch {
                        i += 1;
                    }
                }
                None => {
                    // No more squeeze chars — shift the remaining data
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    was_squeeze_char = false;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

// ============================================================================
// Mmap-based functions (zero-copy input from byte slice)
// ============================================================================

/// Translate bytes from an mmap'd byte slice — zero syscall reads.
/// Uses SIMD AVX2 for range-delta patterns (e.g., a-z → A-Z).
/// For large data: parallel translation with rayon + a single write_all syscall.
/// For small data: sequential chunked processing.
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    if matches!(kind, TranslateKind::Identity) {
        return writer.write_all(data);
    }

    // Parallel path for large data: translate all chunks in parallel with rayon,
    // then one write_all, collapsing the sequential path's per-chunk write()
    // syscalls into a single one.
    if data.len() >= PAR_THRESHOLD {
        let mut out = vec![0u8; data.len()];
        let n = rayon::current_num_threads().max(1);
        let cs = (data.len() / n).max(BUF_SIZE);
        {
            data.par_chunks(cs)
                .zip(out.par_chunks_mut(cs))
                .for_each(|(inp, outp)| {
                    translate_chunk_dispatch(inp, &mut outp[..inp.len()], &table, &kind, use_simd);
                });
        }
        return writer.write_all(&out);
    }

    // Small files: sequential chunked (.max(1) guards chunks() against a zero
    // chunk size on empty input, which would panic)
    let buf_size = data.len().min(BUF_SIZE).max(1);
    let mut out = vec![0u8; buf_size];
    for chunk in data.chunks(buf_size) {
        translate_chunk_dispatch(chunk, &mut out[..chunk.len()], &table, &kind, use_simd);
        writer.write_all(&out[..chunk.len()])?;
    }
    Ok(())
}
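
// Usage sketch for the mmap path: any byte slice works, mmap'd or not
// (illustrative test, not from the original source):
#[cfg(test)]
mod translate_mmap_examples {
    use super::translate_mmap;

    #[test]
    fn rot13_on_a_slice() {
        let mut out = Vec::new();
        translate_mmap(
            b"abcdefghijklmnopqrstuvwxyz",
            b"nopqrstuvwxyzabcdefghijklm",
            b"hello",
            &mut out,
        )
        .unwrap();
        assert_eq!(out, b"uryyb");
    }
}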

/// Translate + squeeze from mmap'd byte slice.
/// For large data: parallel translate with rayon, then sequential squeeze + write.
/// For small data: sequential translate + squeeze in-place.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    // Large data: parallel translate, then sequential squeeze
    if data.len() >= PAR_THRESHOLD {
        let mut buf = vec![0u8; data.len()];
        let n = rayon::current_num_threads().max(1);
        let cs = (data.len() / n).max(BUF_SIZE);
        {
            data.par_chunks(cs)
                .zip(buf.par_chunks_mut(cs))
                .for_each(|(inp, outp)| {
                    translate_chunk_dispatch(inp, &mut outp[..inp.len()], &table, &kind, use_simd);
                });
        }
        // Sequential squeeze + write in chunks (squeeze has cross-boundary state)
        let mut last_squeezed: u16 = 256;
        let mut outbuf = vec![0u8; BUF_SIZE];
        for chunk in buf.chunks(BUF_SIZE) {
            let mut wp = 0;
            unsafe {
                let ptr = chunk.as_ptr();
                let outp = outbuf.as_mut_ptr();
                for i in 0..chunk.len() {
                    let b = *ptr.add(i);
                    if is_member(&squeeze_set, b) {
                        if last_squeezed == b as u16 {
                            continue;
                        }
                        last_squeezed = b as u16;
                    } else {
                        last_squeezed = 256;
                    }
                    *outp.add(wp) = b;
                    wp += 1;
                }
            }
            writer.write_all(&outbuf[..wp])?;
        }
        return Ok(());
    }

    // Small data: sequential translate + squeeze in-place
    // (.max(1) guards chunks() against a zero chunk size on empty input)
    let buf_size = data.len().min(BUF_SIZE).max(1);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_chunk_dispatch(chunk, &mut buf[..chunk.len()], &table, &kind, use_simd);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Delete from mmap'd byte slice.
/// Uses parallel processing + SIMD memchr for maximum throughput.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single-character delete uses SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }

    // Fast path: 2-char delete uses SIMD memchr2 (bulk copy between matches)
    if delete_chars.len() == 2 {
        return delete_multi_memchr_mmap::<2>(delete_chars, data, writer);
    }

    // Fast path: 3-char delete uses SIMD memchr3 (bulk copy between matches)
    if delete_chars.len() == 3 {
        return delete_multi_memchr_mmap::<3>(delete_chars, data, writer);
    }

    let member = build_member_set(delete_chars);

    // Parallel path: each thread processes its chunk independently
    if data.len() >= PAR_THRESHOLD {
        let n = rayon::current_num_threads().max(1);
        let cs = (data.len() / n).max(BUF_SIZE);
        let results: Vec<Vec<u8>> = data
            .par_chunks(cs)
            .map(|chunk| {
                let mut outbuf: Vec<u8> = Vec::with_capacity(chunk.len());
                let len = chunk.len();
                let mut i = 0;
                unsafe {
                    outbuf.set_len(chunk.len());
                    let mut out_pos = 0;
                    let outp: *mut u8 = outbuf.as_mut_ptr();
                    // Branchless compaction: always write the byte, then advance
                    // the write cursor only when the byte is kept.
                    while i + 8 <= len {
                        let b0 = *chunk.get_unchecked(i);
                        let b1 = *chunk.get_unchecked(i + 1);
                        let b2 = *chunk.get_unchecked(i + 2);
                        let b3 = *chunk.get_unchecked(i + 3);
                        let b4 = *chunk.get_unchecked(i + 4);
                        let b5 = *chunk.get_unchecked(i + 5);
                        let b6 = *chunk.get_unchecked(i + 6);
                        let b7 = *chunk.get_unchecked(i + 7);
                        *outp.add(out_pos) = b0;
                        out_pos += !is_member(&member, b0) as usize;
                        *outp.add(out_pos) = b1;
                        out_pos += !is_member(&member, b1) as usize;
                        *outp.add(out_pos) = b2;
                        out_pos += !is_member(&member, b2) as usize;
                        *outp.add(out_pos) = b3;
                        out_pos += !is_member(&member, b3) as usize;
                        *outp.add(out_pos) = b4;
                        out_pos += !is_member(&member, b4) as usize;
                        *outp.add(out_pos) = b5;
                        out_pos += !is_member(&member, b5) as usize;
                        *outp.add(out_pos) = b6;
                        out_pos += !is_member(&member, b6) as usize;
                        *outp.add(out_pos) = b7;
                        out_pos += !is_member(&member, b7) as usize;
                        i += 8;
                    }
                    while i < len {
                        let b = *chunk.get_unchecked(i);
                        *outp.add(out_pos) = b;
                        out_pos += !is_member(&member, b) as usize;
                        i += 1;
                    }
                    outbuf.set_len(out_pos);
                }
                outbuf
            })
            .collect();
        for r in &results {
            if !r.is_empty() {
                writer.write_all(r)?;
            }
        }
        return Ok(());
    }

    // Small data: sequential (.max(1) guards chunks() against a zero chunk size
    // on empty input)
    let buf_size = data.len().min(BUF_SIZE).max(1);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        let len = chunk.len();
        let mut i = 0;

        while i + 8 <= len {
            unsafe {
                let b0 = *chunk.get_unchecked(i);
                let b1 = *chunk.get_unchecked(i + 1);
                let b2 = *chunk.get_unchecked(i + 2);
                let b3 = *chunk.get_unchecked(i + 3);
                let b4 = *chunk.get_unchecked(i + 4);
                let b5 = *chunk.get_unchecked(i + 5);
                let b6 = *chunk.get_unchecked(i + 6);
                let b7 = *chunk.get_unchecked(i + 7);

                *outbuf.get_unchecked_mut(out_pos) = b0;
                out_pos += !is_member(&member, b0) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b1;
                out_pos += !is_member(&member, b1) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b2;
                out_pos += !is_member(&member, b2) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b3;
                out_pos += !is_member(&member, b3) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b4;
                out_pos += !is_member(&member, b4) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b5;
                out_pos += !is_member(&member, b5) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b6;
                out_pos += !is_member(&member, b6) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b7;
                out_pos += !is_member(&member, b7) as usize;
            }
            i += 8;
        }

        while i < len {
            unsafe {
                let b = *chunk.get_unchecked(i);
                *outbuf.get_unchecked_mut(out_pos) = b;
                out_pos += !is_member(&member, b) as usize;
            }
            i += 1;
        }

        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
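
// General-path sketch (four delete chars, past the memchr fast paths;
// illustrative test, not from the original source):
#[cfg(test)]
mod delete_mmap_examples {
    use super::delete_mmap;

    #[test]
    fn deletes_vowels() {
        let mut out = Vec::new();
        delete_mmap(b"aeiou", b"the quick brown fox", &mut out).unwrap();
        assert_eq!(out, b"th qck brwn fx");
    }
}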

/// Multi-character delete (2-3 chars) using SIMD memchr2/memchr3.
/// Parallel: each thread processes a chunk independently.
fn delete_multi_memchr_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if N >= 2 { chars[1] } else { 0 };
    let c2 = if N >= 3 { chars[2] } else { 0 };

    // Parallel path
    if data.len() >= PAR_THRESHOLD {
        let n = rayon::current_num_threads().max(1);
        let cs = (data.len() / n).max(BUF_SIZE);
        let results: Vec<Vec<u8>> = data
            .par_chunks(cs)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                macro_rules! process_iter {
                    ($iter:expr) => {
                        for pos in $iter {
                            if pos > last {
                                out.extend_from_slice(&chunk[last..pos]);
                            }
                            last = pos + 1;
                        }
                    };
                }
                if N == 2 {
                    process_iter!(memchr::memchr2_iter(c0, c1, chunk));
                } else {
                    process_iter!(memchr::memchr3_iter(c0, c1, c2, chunk));
                }
                if last < chunk.len() {
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();
        for r in &results {
            if !r.is_empty() {
                writer.write_all(r)?;
            }
        }
        return Ok(());
    }

    // Small data: sequential (.max(1) guards chunks() against a zero chunk size
    // on empty input)
    let buf_size = data.len().min(BUF_SIZE).max(1);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;

        macro_rules! process_iter {
            ($iter:expr) => {
                for pos in $iter {
                    if pos > last {
                        let run = pos - last;
                        outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                        wp += run;
                    }
                    last = pos + 1;
                }
            };
        }

        if N == 2 {
            process_iter!(memchr::memchr2_iter(c0, c1, chunk));
        } else {
            process_iter!(memchr::memchr3_iter(c0, c1, c2, chunk));
        }

        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
1431
/// Single-character delete from mmap using SIMD memchr.
/// Parallel: each thread processes a chunk independently with memchr_iter + bulk copy.
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Parallel path: each chunk independently processed
    if data.len() >= PAR_THRESHOLD {
        let n = rayon::current_num_threads().max(1);
        let cs = (data.len() / n).max(BUF_SIZE);
        let results: Vec<Vec<u8>> = data
            .par_chunks(cs)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                for pos in memchr::memchr_iter(ch, chunk) {
                    if pos > last {
                        out.extend_from_slice(&chunk[last..pos]);
                    }
                    last = pos + 1;
                }
                if last < chunk.len() {
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();
        for r in &results {
            if !r.is_empty() {
                writer.write_all(r)?;
            }
        }
        return Ok(());
    }

    // Small data: sequential
    let buf_size = data.len().min(BUF_SIZE).max(1); // .max(1): chunks() panics on a zero chunk size
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                let run = pos - last;
                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                wp += run;
            }
            last = pos + 1;
        }
        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
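
// Hedged sanity-check sketch (added for illustration; not part of the original
// file). `Vec<u8>` implements `Write`, so it can stand in for the output stream.
#[cfg(test)]
mod delete_single_char_mmap_tests {
    use super::*;

    #[test]
    fn removes_every_occurrence() {
        let mut out = Vec::new();
        delete_single_char_mmap(b'l', b"hello world", &mut out).unwrap();
        assert_eq!(out, b"heo word".to_vec());
    }

    #[test]
    fn empty_input_is_a_no_op() {
        let mut out = Vec::new();
        delete_single_char_mmap(b'x', b"", &mut out).unwrap();
        assert!(out.is_empty());
    }
}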

/// Delete + squeeze from mmap'd byte slice.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let buf_size = data.len().min(BUF_SIZE).max(1); // .max(1): chunks() panics on a zero chunk size
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256; // 256 = sentinel for "no squeezable byte pending" (valid bytes are 0..=255)

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
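
// Illustrative check (added; not from the original source): delete 'a' while
// squeezing runs of 'b', writing into a Vec<u8>.
#[cfg(test)]
mod delete_squeeze_mmap_tests {
    use super::*;

    #[test]
    fn deletes_then_squeezes() {
        let mut out = Vec::new();
        delete_squeeze_mmap(b"a", b"b", b"abba ab bb", &mut out).unwrap();
        // After deleting 'a': "bb b bb"; each run of 'b' then squeezes to one byte.
        assert_eq!(out, b"b b b".to_vec());
    }
}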

/// Squeeze from mmap'd byte slice.
/// Sets of 1-3 characters get SIMD fast paths: scan for squeezable bytes with
/// memchr/memchr2/memchr3, then copy the non-squeezed runs in bulk. Larger
/// sets fall back to a byte-at-a-time membership check.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single squeeze character — use SIMD memchr to find runs
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }

    // Fast path: 2-3 squeeze chars — use memchr2/memchr3 for SIMD scanning
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    // General path: chunked output buffer with member check
    let member = build_member_set(squeeze_chars);
    let buf_size = data.len().min(BUF_SIZE).max(1); // .max(1): chunks() panics on a zero chunk size
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    // Skip consecutive duplicates
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
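
// Usage sketch (added for illustration; not in the original file): a 2-char set
// takes the memchr2 fast path, while a 4-char set exercises the general
// membership-table loop above.
#[cfg(test)]
mod squeeze_mmap_tests {
    use super::*;

    #[test]
    fn two_char_fast_path() {
        let mut out = Vec::new();
        squeeze_mmap(b"ab", b"aaabbbab", &mut out).unwrap();
        assert_eq!(out, b"abab".to_vec());
    }

    #[test]
    fn general_path_with_four_chars() {
        let mut out = Vec::new();
        squeeze_mmap(b"abcd", b"aabbccdd-aa", &mut out).unwrap();
        assert_eq!(out, b"abcd-a".to_vec());
    }
}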

/// Squeeze with 2-3 char sets using SIMD memchr2/memchr3 for fast scanning.
/// Batched: copies into output buffer, single write_all per buffer fill.
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut wp = 0;
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > buf_size {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > buf_size {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                // Copy non-member span + first squeeze char to output buffer
                if pos > cursor {
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    if wp >= buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                // Skip consecutive duplicates of same byte
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
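
// Focused sketch for the const-generic specialization (added; inputs are
// hypothetical): N selects memchr2 vs memchr3 at compile time, so ::<2> must
// only be called with a 2-byte set.
#[cfg(test)]
mod squeeze_multi_mmap_tests {
    use super::*;

    #[test]
    fn two_char_squeeze_keeps_other_bytes_intact() {
        let mut out = Vec::new();
        squeeze_multi_mmap::<2>(b"ab", b"xaayybbx", &mut out).unwrap();
        assert_eq!(out, b"xayybx".to_vec());
    }
}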

/// Squeeze a single repeated character from mmap'd data.
/// Uses SIMD memchr for fast scanning + bulk copy of non-squeeze regions.
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // Fast path: no consecutive duplicates — zero-copy output
    if memchr::memmem::find(data, &[ch, ch]).is_none() {
        return writer.write_all(data);
    }

    // For large data: process into a single output buffer and write once.
    // Squeeze has cross-boundary state, so we can't easily parallelize.
    // But a single large output buffer + one write() is faster than many small writes.
    if data.len() >= PAR_THRESHOLD {
        let mut outbuf = Vec::with_capacity(data.len());
        let len = data.len();
        let mut cursor = 0;
        while cursor < len {
            match memchr::memchr(ch, &data[cursor..]) {
                Some(offset) => {
                    let pos = cursor + offset;
                    if pos > cursor {
                        outbuf.extend_from_slice(&data[cursor..pos]);
                    }
                    outbuf.push(ch);
                    cursor = pos + 1;
                    while cursor < len && data[cursor] == ch {
                        cursor += 1;
                    }
                }
                None => {
                    outbuf.extend_from_slice(&data[cursor..]);
                    break;
                }
            }
        }
        return writer.write_all(&outbuf);
    }

    // Small data: chunked output
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let len = data.len();
    let mut wp = 0;
    let mut cursor = 0;

    while cursor < len {
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let gap = pos - cursor;
                if gap > 0 {
                    if wp + gap > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if gap > buf_size {
                        writer.write_all(&data[cursor..pos])?;
                    } else {
                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
                        wp += gap;
                    }
                }
                if wp >= buf_size {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }
                outbuf[wp] = ch;
                wp += 1;
                cursor = pos + 1;
                while cursor < len && data[cursor] == ch {
                    cursor += 1;
                }
            }
            None => {
                let remaining = len - cursor;
                if remaining > 0 {
                    if wp + remaining > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if remaining > buf_size {
                        writer.write_all(&data[cursor..])?;
                    } else {
                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
                        wp += remaining;
                    }
                }
                break;
            }
        }
    }

    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
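
// Minimal checks for the single-char path (added as an illustrative sketch):
// the first input contains duplicate runs; the second takes the zero-copy fast
// path because no two adjacent bytes match.
#[cfg(test)]
mod squeeze_single_mmap_tests {
    use super::*;

    #[test]
    fn collapses_runs() {
        let mut out = Vec::new();
        squeeze_single_mmap(b'-', b"a--b---c", &mut out).unwrap();
        assert_eq!(out, b"a-b-c".to_vec());
    }

    #[test]
    fn no_adjacent_duplicates_is_passthrough() {
        let mut out = Vec::new();
        squeeze_single_mmap(b'-', b"a-b-c", &mut out).unwrap();
        assert_eq!(out, b"a-b-c".to_vec());
    }
}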