// coreutils_rs/tr/core.rs

1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5const BUF_SIZE: usize = 8 * 1024 * 1024; // 8MB — reduces syscall overhead
6
7/// Minimum size for parallel translation.
8/// AVX2 SIMD translation is so fast (~16 GB/s) that parallelism only helps for very large inputs.
9const PARALLEL_TRANSLATE_THRESHOLD: usize = 256 * 1024 * 1024;
10
/// Build a 256-byte lookup table mapping each byte of `set1` to the
/// corresponding byte of `set2`; every byte not in `set1` maps to itself.
///
/// When `set1` is longer than `set2`, the final byte of `set2` is repeated
/// for the excess positions (GNU tr padding); an empty `set2` leaves those
/// bytes untranslated.
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    for (idx, &src) in set1.iter().enumerate() {
        // Paired byte if present, else last byte of set2, else identity.
        let dst = set2
            .get(idx)
            .or_else(|| set2.last())
            .copied()
            .unwrap_or(src);
        table[src as usize] = dst;
    }
    table
}
25
/// Build a 256-bit bitset (32 bytes) answering "is this byte in `chars`?"
/// in O(1): bit `b & 7` of byte `b >> 3` is set for every member `b`.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut bits = [0u8; 32];
    chars
        .iter()
        .for_each(|&b| bits[usize::from(b >> 3)] |= 1 << (b & 7));
    bits
}
35
/// Test whether `ch` is present in a 256-bit membership set built by
/// `build_member_set`.
///
/// Safe indexing replaces the previous `get_unchecked`: `ch >> 3` is at
/// most 31 for any `u8`, so the compiler can prove the index is in bounds
/// and elides the check — same code, no `unsafe`.
#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    set[usize::from(ch >> 3)] & (1 << (ch & 7)) != 0
}
40
41// ============================================================================
42// Table analysis for SIMD fast paths
43// ============================================================================
44
/// Classification of a translation table for SIMD optimization.
#[allow(dead_code)] // Fields used only on x86_64 (AVX2 SIMD path)
enum TranslateKind {
    /// Table is identity — no translation needed.
    Identity,
    /// A contiguous range [lo, hi] with constant wrapping-add delta; identity elsewhere.
    /// Example: a-z → A-Z is `RangeDelta { lo: b'a', hi: b'z', delta: 224 }`
    /// (224 is -32 as a wrapping u8).
    RangeDelta { lo: u8, hi: u8, delta: u8 },
    /// Arbitrary translation — use general lookup table.
    General,
}
55
56/// Analyze a translation table to detect SIMD-optimizable patterns.
57fn analyze_table(table: &[u8; 256]) -> TranslateKind {
58    let mut first_delta: Option<u8> = None;
59    let mut lo: u8 = 0;
60    let mut hi: u8 = 0;
61    let mut count: u32 = 0;
62
63    for i in 0..256u16 {
64        let actual = table[i as usize];
65        if actual != i as u8 {
66            let delta = actual.wrapping_sub(i as u8);
67            count += 1;
68            match first_delta {
69                None => {
70                    first_delta = Some(delta);
71                    lo = i as u8;
72                    hi = i as u8;
73                }
74                Some(d) if d == delta => {
75                    hi = i as u8;
76                }
77                _ => return TranslateKind::General,
78            }
79        }
80    }
81
82    match (count, first_delta) {
83        (0, _) => TranslateKind::Identity,
84        (c, Some(delta)) if c == (hi as u32 - lo as u32 + 1) => {
85            TranslateKind::RangeDelta { lo, hi, delta }
86        }
87        _ => TranslateKind::General,
88    }
89}
90
91// ============================================================================
92// SIMD translation (x86_64 AVX2)
93// ============================================================================
94
/// AVX2 SIMD kernels for the `RangeDelta` translation pattern (x86_64 only):
/// bytes inside [lo, hi] get `delta` added (wrapping); others pass through.
#[cfg(target_arch = "x86_64")]
mod simd_tr {
    /// Translate bytes in range [lo, hi] by adding `delta` (wrapping), leave others unchanged.
    /// Processes 128 bytes per iteration (4x unroll) using AVX2.
    ///
    /// The in-range test is branchless: with d = b - lo (wrapping),
    /// `min_epu8(d, hi - lo) == d` holds exactly when d <= hi - lo as
    /// unsigned, i.e. when b lies in [lo, hi]. `cmpeq` turns that into an
    /// all-ones lane mask, and `(mask & delta)` adds delta only to in-range
    /// lanes.
    ///
    /// SAFETY: Caller must ensure AVX2 is available and out.len() >= data.len().
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta(data: &[u8], out: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            // hi - lo cannot underflow: analyze_table only emits lo <= hi.
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let inp = data.as_ptr();
            let outp = out.as_mut_ptr();
            let mut i = 0usize;

            // 4x unrolled: process 128 bytes per iteration for better ILP
            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(inp.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(inp.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(inp.add(i + 96) as *const __m256i);

                // d = byte - lo (wrapping), per lane
                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                // 0xFF lanes where d <= hi - lo (unsigned), i.e. byte in range
                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                // Add delta only to the in-range lanes (mask & delta is 0 elsewhere)
                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(outp.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(outp.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(outp.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(outp.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            // Single-vector (32-byte) cleanup loop
            while i + 32 <= len {
                let v = _mm256_loadu_si256(inp.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(outp.add(i) as *mut __m256i, result);
                i += 32;
            }

            // Scalar tail (< 32 bytes): same branchless range test
            while i < len {
                let b = *inp.add(i);
                *outp.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }

    /// In-place SIMD translate for stdin path.
    /// Same algorithm as `range_delta` but reads and writes one buffer,
    /// avoiding a second allocation.
    ///
    /// SAFETY: Caller must ensure AVX2 is available.
    #[target_feature(enable = "avx2")]
    pub unsafe fn range_delta_inplace(data: &mut [u8], lo: u8, hi: u8, delta: u8) {
        unsafe {
            use std::arch::x86_64::*;

            let lo_vec = _mm256_set1_epi8(lo as i8);
            // hi - lo cannot underflow: analyze_table only emits lo <= hi.
            let range_vec = _mm256_set1_epi8((hi - lo) as i8);
            let delta_vec = _mm256_set1_epi8(delta as i8);

            let len = data.len();
            let ptr = data.as_mut_ptr();
            let mut i = 0usize;

            // 4x unrolled main loop, 128 bytes per iteration
            while i + 128 <= len {
                let v0 = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let v1 = _mm256_loadu_si256(ptr.add(i + 32) as *const __m256i);
                let v2 = _mm256_loadu_si256(ptr.add(i + 64) as *const __m256i);
                let v3 = _mm256_loadu_si256(ptr.add(i + 96) as *const __m256i);

                let d0 = _mm256_sub_epi8(v0, lo_vec);
                let d1 = _mm256_sub_epi8(v1, lo_vec);
                let d2 = _mm256_sub_epi8(v2, lo_vec);
                let d3 = _mm256_sub_epi8(v3, lo_vec);

                let m0 = _mm256_cmpeq_epi8(_mm256_min_epu8(d0, range_vec), d0);
                let m1 = _mm256_cmpeq_epi8(_mm256_min_epu8(d1, range_vec), d1);
                let m2 = _mm256_cmpeq_epi8(_mm256_min_epu8(d2, range_vec), d2);
                let m3 = _mm256_cmpeq_epi8(_mm256_min_epu8(d3, range_vec), d3);

                let r0 = _mm256_add_epi8(v0, _mm256_and_si256(m0, delta_vec));
                let r1 = _mm256_add_epi8(v1, _mm256_and_si256(m1, delta_vec));
                let r2 = _mm256_add_epi8(v2, _mm256_and_si256(m2, delta_vec));
                let r3 = _mm256_add_epi8(v3, _mm256_and_si256(m3, delta_vec));

                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, r0);
                _mm256_storeu_si256(ptr.add(i + 32) as *mut __m256i, r1);
                _mm256_storeu_si256(ptr.add(i + 64) as *mut __m256i, r2);
                _mm256_storeu_si256(ptr.add(i + 96) as *mut __m256i, r3);
                i += 128;
            }

            // Single-vector (32-byte) cleanup loop
            while i + 32 <= len {
                let v = _mm256_loadu_si256(ptr.add(i) as *const __m256i);
                let diff = _mm256_sub_epi8(v, lo_vec);
                let mask = _mm256_cmpeq_epi8(_mm256_min_epu8(diff, range_vec), diff);
                let result = _mm256_add_epi8(v, _mm256_and_si256(mask, delta_vec));
                _mm256_storeu_si256(ptr.add(i) as *mut __m256i, result);
                i += 32;
            }

            // Scalar tail (< 32 bytes)
            while i < len {
                let b = *ptr.add(i);
                *ptr.add(i) = if b.wrapping_sub(lo) <= (hi - lo) {
                    b.wrapping_add(delta)
                } else {
                    b
                };
                i += 1;
            }
        }
    }
}
230
/// Check if AVX2 is available at runtime.
/// Compiled only on x86_64; SIMD call sites gate on this so the binary
/// still runs correctly on CPUs without AVX2.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn has_avx2() -> bool {
    // std's feature-detection macro; the probe result is cached.
    is_x86_feature_detected!("avx2")
}
237
/// Fill `buf` completely from `reader`, looping over short reads (pipes
/// often return partial data) and retrying on EINTR.
///
/// Returns the number of bytes read; a value below `buf.len()` means EOF
/// was reached.
fn fill_buf(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    loop {
        if total == buf.len() {
            return Ok(total);
        }
        match reader.read(&mut buf[total..]) {
            Ok(0) => return Ok(total), // EOF
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => {} // retry
            Err(e) => return Err(e),
        }
    }
}
251
/// Translate `chunk` through `table` into `out`.
///
/// Replaces the hand-unrolled `get_unchecked` loop with a safe zip-based
/// loop: a `u8` index can never exceed 255, so the table access needs no
/// bounds check, and `zip` bounds both slices once up front, letting LLVM
/// unroll the loop itself. Same behavior, no `unsafe`.
///
/// Callers must pass `out` at least as long as `chunk`; any extra bytes in
/// `out` are left untouched.
#[inline(always)]
fn translate_chunk(chunk: &[u8], out: &mut [u8], table: &[u8; 256]) {
    for (dst, &src) in out.iter_mut().zip(chunk) {
        *dst = table[src as usize];
    }
}
284
/// In-place translate for the stdin path — reuses the read buffer instead
/// of allocating a separate output.
///
/// Replaces the raw-pointer 8x-unrolled loop with a safe `iter_mut` loop:
/// a `u8` index is always < 256 so the table lookup has no bounds check,
/// and the iterator loop carries no per-byte slice checks either, so LLVM
/// unrolls it on its own. Same behavior, no `unsafe`.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    for b in data.iter_mut() {
        *b = table[*b as usize];
    }
}
311
312// ============================================================================
313// Dispatch: choose SIMD or scalar based on table analysis
314// ============================================================================
315
/// Translate a chunk using the best available method.
///
/// `kind` must come from `analyze_table(table)`; `table` is the general
/// fallback whenever the SIMD fast path does not apply. Requires
/// `out.len() >= chunk.len()`.
#[inline]
fn translate_chunk_dispatch(chunk: &[u8], out: &mut [u8], table: &[u8; 256], kind: &TranslateKind) {
    match kind {
        TranslateKind::Identity => {
            // No byte changes — plain copy.
            out[..chunk.len()].copy_from_slice(chunk);
        }
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if has_avx2() {
                // SAFETY: AVX2 presence checked just above; caller
                // guarantees out.len() >= chunk.len().
                unsafe { simd_tr::range_delta(chunk, out, *lo, *hi, *delta) };
                return;
            }
            // CPU lacks AVX2 at runtime — scalar table fallback.
            translate_chunk(chunk, out, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            // Non-x86_64 builds have no SIMD kernel — scalar fallback.
            translate_chunk(chunk, out, table);
        }
        TranslateKind::General => {
            translate_chunk(chunk, out, table);
        }
    }
}
340
/// In-place translate dispatch (for stdin path).
///
/// Mirrors `translate_chunk_dispatch` but mutates `data` directly;
/// `kind` must come from `analyze_table(table)`.
#[inline]
fn translate_inplace_dispatch(data: &mut [u8], table: &[u8; 256], kind: &TranslateKind) {
    match kind {
        // Identity: nothing to do at all for the in-place case.
        TranslateKind::Identity => {}
        #[cfg(target_arch = "x86_64")]
        TranslateKind::RangeDelta { lo, hi, delta } => {
            if has_avx2() {
                // SAFETY: AVX2 presence checked just above.
                unsafe { simd_tr::range_delta_inplace(data, *lo, *hi, *delta) };
                return;
            }
            // CPU lacks AVX2 at runtime — scalar table fallback.
            translate_inplace(data, table);
        }
        #[cfg(not(target_arch = "x86_64"))]
        TranslateKind::RangeDelta { .. } => {
            // Non-x86_64 builds have no SIMD kernel — scalar fallback.
            translate_inplace(data, table);
        }
        TranslateKind::General => {
            translate_inplace(data, table);
        }
    }
}
363
364// ============================================================================
365// Streaming functions (Read + Write)
366// ============================================================================
367
368pub fn translate(
369    set1: &[u8],
370    set2: &[u8],
371    reader: &mut impl Read,
372    writer: &mut impl Write,
373) -> io::Result<()> {
374    let table = build_translate_table(set1, set2);
375    let kind = analyze_table(&table);
376
377    // Identity: just copy stdin to stdout
378    if matches!(kind, TranslateKind::Identity) {
379        let mut buf = vec![0u8; BUF_SIZE];
380        loop {
381            let n = fill_buf(reader, &mut buf)?;
382            if n == 0 {
383                break;
384            }
385            writer.write_all(&buf[..n])?;
386        }
387        return Ok(());
388    }
389
390    // In-place translate: read into buffer, translate in-place, write
391    let mut buf = vec![0u8; BUF_SIZE];
392    loop {
393        let n = fill_buf(reader, &mut buf)?;
394        if n == 0 {
395            break;
396        }
397        translate_inplace_dispatch(&mut buf[..n], &table, &kind);
398        writer.write_all(&buf[..n])?;
399    }
400    Ok(())
401}
402
/// Translate via set1 -> set2, then squeeze repeats (`tr -s SET1 SET2`):
/// consecutive identical translated bytes that are members of set2
/// collapse to a single occurrence.
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut inbuf = vec![0u8; BUF_SIZE];
    // Last emitted squeezable byte; 256 is a sentinel outside u8 range
    // meaning "none". Persists across reads so runs spanning buffer
    // boundaries are still squeezed.
    let mut last_squeezed: u16 = 256;

    loop {
        let n = fill_buf(reader, &mut inbuf)?;
        if n == 0 {
            break;
        }
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            // SAFETY: a u8 index is always < 256 == table.len().
            let translated = unsafe { *table.get_unchecked(b as usize) };
            if is_member(&squeeze_set, translated) {
                if last_squeezed == translated as u16 {
                    continue; // repeat of previous squeezable byte — drop it
                }
                last_squeezed = translated as u16;
            } else {
                last_squeezed = 256;
            }
            // SAFETY: out_pos <= bytes emitted so far < n <= outbuf.len().
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = translated;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
440
441pub fn delete(
442    delete_chars: &[u8],
443    reader: &mut impl Read,
444    writer: &mut impl Write,
445) -> io::Result<()> {
446    // Fast path: single character delete using SIMD memchr
447    if delete_chars.len() == 1 {
448        return delete_single_streaming(delete_chars[0], reader, writer);
449    }
450
451    let member = build_member_set(delete_chars);
452    let mut outbuf = vec![0u8; BUF_SIZE];
453    let mut inbuf = vec![0u8; BUF_SIZE];
454
455    loop {
456        let n = fill_buf(reader, &mut inbuf)?;
457        if n == 0 {
458            break;
459        }
460        let mut out_pos = 0;
461        for &b in &inbuf[..n] {
462            if !is_member(&member, b) {
463                unsafe {
464                    *outbuf.get_unchecked_mut(out_pos) = b;
465                }
466                out_pos += 1;
467            }
468        }
469        writer.write_all(&outbuf[..out_pos])?;
470    }
471    Ok(())
472}
473
/// Single-character delete from a reader using SIMD memchr scanning.
/// Writes the runs of kept bytes straight from the read buffer, so data
/// is never copied into a second buffer.
fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; BUF_SIZE];
    loop {
        let n = fill_buf(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let chunk = &buf[..n];
        // `last` is the start of the current run of bytes to keep.
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                // Flush the run preceding this match.
                writer.write_all(&chunk[last..pos])?;
            }
            last = pos + 1; // skip the matched byte
        }
        // Tail after the final match in this buffer.
        if last < n {
            writer.write_all(&chunk[last..n])?;
        }
    }
    Ok(())
}
500
/// Delete bytes in `delete_chars`, then squeeze repeats of bytes in
/// `squeeze_chars` (`tr -ds SET1 SET2`). Deletion is applied first, so a
/// run interrupted only by deleted bytes still squeezes to one copy.
pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut inbuf = vec![0u8; BUF_SIZE];
    // Last emitted squeezable byte; 256 = "none". Persists across reads so
    // runs spanning buffer boundaries are still squeezed.
    let mut last_squeezed: u16 = 256;

    loop {
        let n = fill_buf(reader, &mut inbuf)?;
        if n == 0 {
            break;
        }
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            if is_member(&delete_set, b) {
                continue; // deleted bytes don't touch the squeeze state
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue; // repeat within a squeezable run — drop
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            // SAFETY: out_pos <= bytes emitted so far < n <= outbuf.len().
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
540
/// Squeeze repeats of bytes in `squeeze_chars` from `reader` to `writer`
/// (`tr -s SET`): each run of the same squeezable byte emits one copy.
pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let member = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];
    let mut inbuf = vec![0u8; BUF_SIZE];
    // Last emitted squeezable byte; 256 = "none". Persists across reads so
    // runs spanning buffer boundaries are still squeezed.
    let mut last_squeezed: u16 = 256;

    loop {
        let n = fill_buf(reader, &mut inbuf)?;
        if n == 0 {
            break;
        }
        let mut out_pos = 0;
        for &b in &inbuf[..n] {
            if is_member(&member, b) {
                if last_squeezed == b as u16 {
                    continue; // repeat within a squeezable run — drop
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            // SAFETY: out_pos <= bytes emitted so far < n <= outbuf.len().
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
575
576// ============================================================================
577// Mmap-based functions (zero-copy input from byte slice)
578// ============================================================================
579
/// Translate bytes from an mmap'd byte slice — zero syscall reads.
/// Uses SIMD AVX2 for range-delta patterns (e.g., a-z → A-Z).
/// For large inputs, translates in parallel using rayon for maximum throughput.
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let kind = analyze_table(&table);

    // Identity mapping: no byte changes — write input through untouched.
    if matches!(kind, TranslateKind::Identity) {
        return writer.write_all(data);
    }

    // Parallel translation for large inputs — each chunk is independent
    if data.len() >= PARALLEL_TRANSLATE_THRESHOLD {
        let num_threads = rayon::current_num_threads().max(1);
        // Ceiling division so each thread gets at most one chunk.
        let chunk_size = (data.len() + num_threads - 1) / num_threads;
        // Align to BUF_SIZE boundaries for cache efficiency
        let chunk_size = ((chunk_size + BUF_SIZE - 1) / BUF_SIZE) * BUF_SIZE;

        // Translate all chunks in parallel.
        // NOTE(review): this materializes the entire translated output
        // (>= PARALLEL_TRANSLATE_THRESHOLD bytes) in memory before writing;
        // confirm that's acceptable for the target memory budget.
        let translated: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = vec![0u8; chunk.len()];
                translate_chunk_dispatch(chunk, &mut out, &table, &kind);
                out
            })
            .collect();

        // Write sequentially to preserve order
        for chunk in &translated {
            writer.write_all(chunk)?;
        }
        return Ok(());
    }

    // Sequential path for smaller data: one reusable output buffer.
    let mut out = vec![0u8; BUF_SIZE];
    for chunk in data.chunks(BUF_SIZE) {
        translate_chunk_dispatch(chunk, &mut out[..chunk.len()], &table, &kind);
        writer.write_all(&out[..chunk.len()])?;
    }
    Ok(())
}
628
/// Translate + squeeze from mmap'd byte slice.
/// Same semantics as `translate_squeeze` but reads from an in-memory slice:
/// consecutive identical translated bytes in set2 collapse to one copy.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let mut outbuf = vec![0u8; BUF_SIZE];
    // Last emitted squeezable byte; 256 = "none". Persists across chunks so
    // runs spanning chunk boundaries are still squeezed.
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        for &b in chunk {
            // SAFETY: a u8 index is always < 256 == table.len().
            let translated = unsafe { *table.get_unchecked(b as usize) };
            if is_member(&squeeze_set, translated) {
                if last_squeezed == translated as u16 {
                    continue; // repeat within a squeezable run — drop
                }
                last_squeezed = translated as u16;
            } else {
                last_squeezed = 256;
            }
            // SAFETY: out_pos <= bytes emitted so far < chunk.len() <= BUF_SIZE.
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = translated;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
662
/// Delete from mmap'd byte slice.
/// Uses SIMD memchr for single-character delete (common case).
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Fast path: single character delete uses SIMD memchr
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }

    let member = build_member_set(delete_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];

    // Compact each chunk into outbuf, dropping members of the delete set.
    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        for &b in chunk {
            if !is_member(&member, b) {
                // SAFETY: out_pos <= bytes kept so far < chunk.len() <= BUF_SIZE.
                unsafe {
                    *outbuf.get_unchecked_mut(out_pos) = b;
                }
                out_pos += 1;
            }
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
688
689/// Single-character delete from mmap using SIMD memchr.
690/// Copies runs of non-matching bytes in bulk (memcpy), far faster than byte-at-a-time.
691fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
692    let mut last = 0;
693    for pos in memchr::memchr_iter(ch, data) {
694        if pos > last {
695            writer.write_all(&data[last..pos])?;
696        }
697        last = pos + 1;
698    }
699    if last < data.len() {
700        writer.write_all(&data[last..])?;
701    }
702    Ok(())
703}
704
/// Delete + squeeze from mmap'd byte slice.
/// Same semantics as `delete_squeeze` but reads from an in-memory slice:
/// deletion is applied first, then runs of squeezable bytes collapse.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut outbuf = vec![0u8; BUF_SIZE];
    // Last emitted squeezable byte; 256 = "none". Persists across chunks so
    // runs spanning chunk boundaries are still squeezed.
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(BUF_SIZE) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue; // deleted bytes don't touch the squeeze state
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue; // repeat within a squeezable run — drop
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            // SAFETY: out_pos <= bytes emitted so far < chunk.len() <= BUF_SIZE.
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
740
741/// Squeeze from mmap'd byte slice.
742pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
743    let member = build_member_set(squeeze_chars);
744    let mut outbuf = vec![0u8; BUF_SIZE];
745    let mut last_squeezed: u16 = 256;
746
747    for chunk in data.chunks(BUF_SIZE) {
748        let mut out_pos = 0;
749        for &b in chunk {
750            if is_member(&member, b) {
751                if last_squeezed == b as u16 {
752                    continue;
753                }
754                last_squeezed = b as u16;
755            } else {
756                last_squeezed = 256;
757            }
758            unsafe {
759                *outbuf.get_unchecked_mut(out_pos) = b;
760            }
761            out_pos += 1;
762        }
763        writer.write_all(&outbuf[..out_pos])?;
764    }
765    Ok(())
766}