// coreutils_rs/tr/core.rs

use std::io::{self, Read, Write};

use rayon::prelude::*;

const BUF_SIZE: usize = 8 * 1024 * 1024; // 8MB — mmap chunk processing

/// Stream buffer: 256KB — process data immediately after each read().
/// Stays in L2 cache and is a small multiple of the typical 64KB kernel
/// pipe buffer. Unlike fill_buf (which loops to accumulate 8MB = ~128
/// syscalls on a pipe), we read once and process immediately, matching
/// GNU tr's approach.
const STREAM_BUF: usize = 256 * 1024;

/// Minimum input size for parallel translation.
/// Even with fast SIMD, splitting large inputs (e.g., 100MB benchmark files)
/// across cores improves throughput.
const PARALLEL_TRANSLATE_THRESHOLD: usize = 1024 * 1024;

/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
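///
/// If set2 is shorter than set1, its last byte is repeated (as GNU tr does,
/// extending SET2 with its final character). For example, set1 = b"abc" with
/// set2 = b"xy" maps a -> x, b -> y, c -> y.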
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}

/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
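///
/// Example: byte b'A' (65) lives in word 65 >> 3 = 8 at bit 65 & 7 = 1, so a
/// membership test is a single load, shift, and mask.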
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}

// ============================================================================
// Table analysis for SIMD fast paths
// ============================================================================

/// Classification of a translation table for SIMD optimization.
#[allow(dead_code)] // Fields used only on x86_64 (AVX2 SIMD path)
enum TranslateKind {
    /// Table is identity — no translation needed.
    Identity,
    /// A contiguous range [lo, hi] with constant wrapping-add delta; identity elsewhere.
    RangeDelta { lo: u8, hi: u8, delta: u8 },
    /// Arbitrary translation — use general lookup table.
    General,
}

/// Analyze a translation table to detect SIMD-optimizable patterns.
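///
/// Illustrative example: the table for a-z -> A-Z differs from identity only
/// on the contiguous run 97..=122 (b'a'..=b'z'), always by the same wrapping
/// delta 65u8.wrapping_sub(97) = 224, so it classifies as
/// RangeDelta { lo: 97, hi: 122, delta: 224 }.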
fn analyze_table(table: &[u8; 256]) -> TranslateKind {
    let mut first_delta: Option<u8> = None;
    let mut lo: u8 = 0;
    let mut hi: u8 = 0;
    let mut count: u32 = 0;

    for i in 0..256u16 {
        let actual = table[i as usize];
        if actual != i as u8 {
            let delta = actual.wrapping_sub(i as u8);
            count += 1;
            match first_delta {
                None => {
                    first_delta = Some(delta);
                    lo = i as u8;
                    hi = i as u8;
                }
                Some(d) if d == delta => {
                    hi = i as u8;
                }
                _ => return TranslateKind::General,
            }
        }
    }

    match (count, first_delta) {
        (0, _) => TranslateKind::Identity,
        (c, Some(delta)) if c == (hi as u32 - lo as u32 + 1) => {
            TranslateKind::RangeDelta { lo, hi, delta }
        }
        _ => TranslateKind::General,
    }
}

// ============================================================================
// SIMD translation (x86_64 AVX2)
// ============================================================================

#[cfg(target_arch = "x86_64")]
mod simd_tr {
    /// Translate bytes in range [lo, hi] by adding `delta` (wrapping), leave others unchanged.
    /// Processes 128 bytes per iteration (4x unroll) using AVX2.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta(data: &[u8], out: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0usize;

            // 4x unrolled: process 128 bytes per iteration for better ILP
            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(inp.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(inp.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

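                // AVX2 has no unsigned byte compare; min_epu8(d, range) == d
                // holds iff d <= range as unsigned bytes, which tests
                // lo <= v <= hi after the wrapping subtract above.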
                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(outp.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(outp.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *inp.add(i);
                *outp.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }

    /// In-place SIMD translate for stdin path.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta_inplace(data: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0usize;

            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(ptr.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(ptr.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(ptr.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(ptr.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }
}

/// Check if AVX2 is available at runtime.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn has_avx2() -> bool {
    is_x86_feature_detected!("avx2")
}

/// Translate a chunk of bytes using a lookup table — unrolled 8-byte inner loop.
#[inline(always)]
fn translate_chunk(chunk: &[u8], out: &mut [u8], table: &[u8; 256]) {
    let len = chunk.len();
    let mut i = 0;
    while i + 8 <= len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
            *out.get_unchecked_mut(i + 1) =
                *table.get_unchecked(*chunk.get_unchecked(i + 1) as usize);
            *out.get_unchecked_mut(i + 2) =
                *table.get_unchecked(*chunk.get_unchecked(i + 2) as usize);
            *out.get_unchecked_mut(i + 3) =
                *table.get_unchecked(*chunk.get_unchecked(i + 3) as usize);
            *out.get_unchecked_mut(i + 4) =
                *table.get_unchecked(*chunk.get_unchecked(i + 4) as usize);
            *out.get_unchecked_mut(i + 5) =
                *table.get_unchecked(*chunk.get_unchecked(i + 5) as usize);
            *out.get_unchecked_mut(i + 6) =
                *table.get_unchecked(*chunk.get_unchecked(i + 6) as usize);
            *out.get_unchecked_mut(i + 7) =
                *table.get_unchecked(*chunk.get_unchecked(i + 7) as usize);
        }
        i += 8;
    }
    while i < len {
        unsafe {
            *out.get_unchecked_mut(i) = *table.get_unchecked(*chunk.get_unchecked(i) as usize);
        }
        i += 1;
    }
}

/// In-place translate for stdin path — avoids separate output buffer.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let tab = table.as_ptr();

    unsafe {
        let mut i = 0;
        while i + 8 <= len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *tab.add(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *tab.add(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *tab.add(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *tab.add(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *tab.add(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *tab.add(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *tab.add(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *tab.add(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// Dispatch: choose SIMD or scalar based on table analysis
// ============================================================================

/// Translate a chunk using the best available method.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
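/// (`_use_simd` carries a leading underscore because it is only read on
/// x86_64; other targets fall through to the scalar paths.)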
#[inline]
fn translate_chunk_dispatch(
    chunk: &[u8],
    out: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {
            out[..chunk.len()].copy_from_slice(chunk);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta(chunk, out, *lo, *hi, *delta) };
                return;
            }
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_chunk(chunk, out, table);
        }
        TranslateKind::General => {
            translate_chunk(chunk, out, table);
        }
    }
}

/// In-place translate dispatch.
/// `use_simd` is cached at call site to avoid per-chunk atomic loads.
#[inline]
fn translate_inplace_dispatch(
    data: &mut [u8],
    table: &[u8; 256],
    kind: &TranslateKind,
    _use_simd: bool,
) {
    match kind {
        TranslateKind::Identity => {}
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if _use_simd {
                unsafe { simd_tr::range_delta_inplace(data, *lo, *hi, *delta) };
                return;
            }
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            translate_inplace(data, table);
        }
        TranslateKind::General => {
            translate_inplace(data, table);
        }
    }
}

// ============================================================================
// Streaming functions (Read + Write)
// Process data immediately after each read() — no fill_buf accumulation.
// Uses 256KB buffer (L2-friendly) instead of 8MB.
// ============================================================================

pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        translate_inplace_dispatch(&mut buf[..n], &table, &kind, use_simd);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let mut outbuf = vec![0u8; STREAM_BUF];
    let mut inbuf = vec![0u8; STREAM_BUF];
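    // The sentinel 256 is outside u8's range, so it never equals a real byte;
    // squeeze state deliberately persists across read() chunks.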
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut inbuf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            let translated = unsafe { *table.get_unchecked(b as usize) };
            if is_member(&squeeze_set, translated) {
                if last_squeezed == translated as u16 {
                    continue;
                }
                last_squeezed = translated as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = translated;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Fast path: single character delete using SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut outbuf = vec![0u8; STREAM_BUF];
    let mut inbuf = vec![0u8; STREAM_BUF];

    loop {
        let n = match reader.read(&mut inbuf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            if !is_member(&member, b) {
                unsafe {
                    *outbuf.get_unchecked_mut(out_pos) = b;
                }
                out_pos += 1;
            }
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Single-character delete from a reader using SIMD memchr scanning.
fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let chunk = &buf[..n];
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                writer.write_all(&chunk[last..pos])?;
            }
            last = pos + 1;
        }
        if last < n {
            writer.write_all(&chunk[last..n])?;
        }
    }
    Ok(())
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; STREAM_BUF];
    let mut inbuf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut inbuf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let member = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; STREAM_BUF];
    let mut inbuf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut inbuf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            if is_member(&member, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

// ============================================================================
// Mmap-based functions (zero-copy input from byte slice)
// ============================================================================

/// Translate bytes from an mmap'd byte slice — zero syscall reads.
/// Uses SIMD AVX2 for range-delta patterns (e.g., a-z → A-Z).
/// For large inputs, translates in parallel using rayon for maximum throughput.
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);
    #[cfg(target_arch = "x86_64")]
    let use_simd = has_avx2();
    #[cfg(not(target_arch = "x86_64"))]
    let use_simd = false;

    if matches!(kind, TranslateKind::Identity) {
        return writer.write_all(data);
    }

    // Parallel translation for large inputs — each chunk is independent
    if data.len() >= PARALLEL_TRANSLATE_THRESHOLD {
        let num_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() + num_threads - 1) / num_threads;
        // Round up to a BUF_SIZE multiple so each thread gets large contiguous chunks
        let chunk_size = ((chunk_size + BUF_SIZE - 1) / BUF_SIZE) * BUF_SIZE;

        // Translate all chunks in parallel
        let translated: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = vec![0u8; chunk.len()];
                translate_chunk_dispatch(chunk, &mut out, &table, &kind, use_simd);
                out
            })
            .collect();

        // Write sequentially to preserve order
        for chunk in &translated {
            writer.write_all(chunk)?;
        }
        return Ok(());
    }

    // Sequential path for smaller data
    let mut out = vec![0u8; BUF_SIZE];
    for chunk in data.chunks(BUF_SIZE) {
        translate_chunk_dispatch(chunk, &mut out[..chunk.len()], &table, &kind, use_simd);
        writer.write_all(&out[..chunk.len()])?;
    }
    Ok(())
}

/// Translate + squeeze from mmap'd byte slice.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        for &b in chunk {
            let translated = unsafe { *table.get_unchecked(b as usize) };
            if is_member(&squeeze_set, translated) {
                if last_squeezed == translated as u16 {
                    continue;
                }
                last_squeezed = translated as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = translated;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Delete from mmap'd byte slice.
/// Uses SIMD memchr for single-character delete (common case).
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single character delete uses SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }

    let member = build_member_set(delete_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        for &b in chunk {
            if !is_member(&member, b) {
                unsafe {
                    *outbuf.get_unchecked_mut(out_pos) = b;
                }
                out_pos += 1;
            }
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Single-character delete from mmap using SIMD memchr.
/// Copies runs of non-matching bytes in bulk (memcpy), far faster than byte-at-a-time.
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut last = 0;
    for pos in memchr::memchr_iter(ch, data) {
        if pos > last {
            writer.write_all(&data[last..pos])?;
        }
        last = pos + 1;
    }
    if last < data.len() {
        writer.write_all(&data[last..])?;
    }
    Ok(())
}

/// Delete + squeeze from mmap'd byte slice.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Squeeze from mmap'd byte slice.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let member = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&member, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
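
// ============================================================================
// Tests — minimal usage sketches for the public API. These are illustrative
// and exercise only the semantics defined above, using in-memory readers and
// writers (any Read/Write source works, e.g. stdin/stdout in the binary).
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn translate_lower_to_upper() {
        let mut input: &[u8] = b"hello, World!";
        let mut out = Vec::new();
        translate(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
            &mut input,
            &mut out,
        )
        .unwrap();
        assert_eq!(out, b"HELLO, WORLD!");
    }

    #[test]
    fn translate_squeeze_collapses_translated_runs() {
        // Squeezing applies to SET2 members after translation: "aaabbb"
        // translates to "xxxyyy", which squeezes to "xy".
        let mut input: &[u8] = b"aaabbb";
        let mut out = Vec::new();
        translate_squeeze(b"ab", b"xy", &mut input, &mut out).unwrap();
        assert_eq!(out, b"xy");
    }

    #[test]
    fn delete_and_squeeze_basics() {
        let mut input: &[u8] = b"foo bar baz";
        let mut out = Vec::new();
        delete(b"ab", &mut input, &mut out).unwrap();
        assert_eq!(out, b"foo r z");

        let mut input: &[u8] = b"foo bar baz";
        let mut out = Vec::new();
        squeeze(b"o", &mut input, &mut out).unwrap();
        assert_eq!(out, b"fo bar baz");
    }

    #[test]
    fn mmap_path_matches_streaming_path() {
        let data = b"the quick brown fox".repeat(1024);
        let mut mmap_out = Vec::new();
        translate_mmap(b"aeiou", b"AEIOU", &data, &mut mmap_out).unwrap();

        let mut stream_out = Vec::new();
        translate(b"aeiou", b"AEIOU", &mut data.as_slice(), &mut stream_out).unwrap();
        assert_eq!(mmap_out, stream_out);
    }
}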