coreutils_rs/tr/core.rs

use std::io::{self, Read, Write};

/// Main processing buffer: 32MB.
const BUF_SIZE: usize = 32 * 1024 * 1024;

/// Stream buffer: 16MB.
const STREAM_BUF: usize = 16 * 1024 * 1024;

/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
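/// Any byte not in set1 maps to itself. If set2 is shorter than set1, its last byte
/// is reused for the remainder: e.g. build_translate_table(b"abc", b"x") maps 'a',
/// 'b', and 'c' all to b'x'.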
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}

/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
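/// Byte b occupies bit (b & 7) of set[b >> 3]; e.g. b'\n' (10) sets bit 2 of set[1].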
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}

/// Translate bytes in-place using a 256-byte lookup table.
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    for b in data.iter_mut() {
        *b = unsafe { *table.get_unchecked(*b as usize) };
    }
}

/// Translate bytes from source to destination using a 256-byte lookup table.
#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

// ============================================================================
// SIMD range translation (x86_64)
// ============================================================================

/// Detect if the translate table is a single contiguous range with constant offset.
/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
/// table[i] = i + offset for all i in [lo, hi].
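/// For example, the table for a-z -> A-Z yields Some((b'a', b'z', -32)); a table that
/// maps only 'a' -> 'x' and 'c' -> 'z' returns None, because the offset matches but
/// the range is not contiguous.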
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}

/// SIMD-accelerated range translation for mmap'd data.
/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Bias: shift range so lo maps to -128 (signed min).
        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
        // For input < lo: biased wraps to large positive (signed), > threshold.
        // For input > hi: biased > -128+range, > threshold.
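        // Worked example (a-z -> A-Z): lo = 0x61, range = 25, bias = 0x1F,
        // threshold = 0x99 (-103 signed). 'a' (0x61) biases to 0x80 (-128) <= -103,
        // so the offset is applied; 'A' (0x41) biases to 0x60 (+96) > -103, left as-is.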
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt = 0xFF where biased > threshold (OUT of range)
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // mask = 0xFF where IN range (NOT gt)
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        // SSE2 tail for 16-byte remainder
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail
        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

/// Scalar range translation fallback for non-x86_64.
#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (i, &b) in src.iter().enumerate() {
        dst[i] = if b >= lo && b <= hi {
            b.wrapping_add(offset as u8)
        } else {
            b
        };
    }
}

// ============================================================================
// In-place SIMD range translation (saves one buffer allocation in streaming)
// ============================================================================

/// In-place SIMD-accelerated range translation.
/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = b.wrapping_add(offset as u8);
        }
    }
}

// ============================================================================
// Streaming functions (Read + Write)
// ============================================================================

pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    // Try SIMD fast path for range translations (in-place, single buffer)
    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_range_stream(lo, hi, offset, reader, writer);
    }

    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_inplace(&mut buf[..n], &table);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

/// Streaming SIMD range translation — single buffer, in-place transform.
/// Saves 16MB allocation + memcpy vs separate src/dst buffers.
fn translate_range_stream(
    lo: u8,
    hi: u8,
    offset: i8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

/// Read as many bytes as possible into buf, retrying on partial reads.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}

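/// Streaming translate + squeeze: translate set1 -> set2, then collapse consecutive
/// repeats of any byte in set2 to a single occurrence. For example, with
/// set1 = b"abc" and set2 = b"xxx", the input "aabbcc" becomes "x".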
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        translate_inplace(&mut buf[..n], &table);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

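/// Streaming delete: drop every byte that appears in delete_chars. Sets of one byte
/// use memchr, two or three bytes use memchr2/memchr3, and larger sets fall back to
/// the 256-bit membership table. For example, deleting b"aeiou" from "hello world"
/// yields "hll wrld".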
pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                if !is_member(&member, b0) {
                    *ptr.add(wp) = b0;
                    wp += 1;
                }
                if !is_member(&member, b1) {
                    *ptr.add(wp) = b1;
                    wp += 1;
                }
                if !is_member(&member, b2) {
                    *ptr.add(wp) = b2;
                    wp += 1;
                }
                if !is_member(&member, b3) {
                    *ptr.add(wp) = b3;
                    wp += 1;
                }
                if !is_member(&member, b4) {
                    *ptr.add(wp) = b4;
                    wp += 1;
                }
                if !is_member(&member, b5) {
                    *ptr.add(wp) = b5;
                    wp += 1;
                }
                if !is_member(&member, b6) {
                    *ptr.add(wp) = b6;
                    wp += 1;
                }
                if !is_member(&member, b7) {
                    *ptr.add(wp) = b7;
                    wp += 1;
                }
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                if !is_member(&member, b) {
                    *ptr.add(wp) = b;
                    wp += 1;
                }
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &buf[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

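/// Streaming delete + squeeze: drop bytes in delete_chars, then collapse consecutive
/// repeats of bytes in squeeze_chars. For example, deleting b"a" and squeezing b"b"
/// turns "aabbbc" into "bc".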
pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

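/// Streaming squeeze: collapse consecutive repeats of any byte in squeeze_chars to a
/// single occurrence; other bytes pass through and reset the run. For example,
/// squeezing b" \n" turns "a  b\n\nc" into "a b\nc".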
pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };

        let mut wp = 0;
        let mut i = 0;

        while i < n {
            if was_squeeze_char && buf[i] == ch {
                i += 1;
                while i < n && buf[i] == ch {
                    i += 1;
                }
                if i >= n {
                    break;
                }
            }

            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    let run_len = offset;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    i += run_len;

                    unsafe {
                        *buf.as_mut_ptr().add(wp) = ch;
                    }
                    wp += 1;
                    was_squeeze_char = true;
                    i += 1;
                    while i < n && buf[i] == ch {
                        i += 1;
                    }
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    was_squeeze_char = false;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

// ============================================================================
// Mmap-based functions (zero-copy input from byte slice)
// ============================================================================

/// Translate bytes from an mmap'd byte slice.
/// Detects single-range translations (e.g., a-z→A-Z) and uses SIMD vectorized
/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
/// Falls back to scalar 256-byte table lookup for general translations.
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    // Check if table is identity — pure passthrough
    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    // Try SIMD fast path for single-range constant-offset translations
    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        let buf_size = data.len().min(BUF_SIZE);
        let mut buf = vec![0u8; buf_size];
        for chunk in data.chunks(buf_size) {
            translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
            writer.write_all(&buf[..chunk.len()])?;
        }
        return Ok(());
    }

    // General case: scalar table lookup in chunks
    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

/// Translate + squeeze from mmap'd byte slice.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);
    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Delete from mmap'd byte slice.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    let member = build_member_set(delete_chars);
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        let len = chunk.len();
        let mut i = 0;

        while i + 8 <= len {
            unsafe {
                let b0 = *chunk.get_unchecked(i);
                let b1 = *chunk.get_unchecked(i + 1);
                let b2 = *chunk.get_unchecked(i + 2);
                let b3 = *chunk.get_unchecked(i + 3);
                let b4 = *chunk.get_unchecked(i + 4);
                let b5 = *chunk.get_unchecked(i + 5);
                let b6 = *chunk.get_unchecked(i + 6);
                let b7 = *chunk.get_unchecked(i + 7);

                *outbuf.get_unchecked_mut(out_pos) = b0;
                out_pos += !is_member(&member, b0) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b1;
                out_pos += !is_member(&member, b1) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b2;
                out_pos += !is_member(&member, b2) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b3;
                out_pos += !is_member(&member, b3) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b4;
                out_pos += !is_member(&member, b4) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b5;
                out_pos += !is_member(&member, b5) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b6;
                out_pos += !is_member(&member, b6) as usize;
                *outbuf.get_unchecked_mut(out_pos) = b7;
                out_pos += !is_member(&member, b7) as usize;
            }
            i += 8;
        }

        while i < len {
            unsafe {
                let b = *chunk.get_unchecked(i);
                *outbuf.get_unchecked_mut(out_pos) = b;
                out_pos += !is_member(&member, b) as usize;
            }
            i += 1;
        }

        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                let run = pos - last;
                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                wp += run;
            }
            last = pos + 1;
        }
        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;

        let iter_fn = |chunk: &[u8]| -> Vec<usize> {
            if is_three {
                memchr::memchr3_iter(c0, c1, c2, chunk).collect()
            } else {
                memchr::memchr2_iter(c0, c1, chunk).collect()
            }
        };

        for pos in iter_fn(chunk) {
            if pos > last {
                let run = pos - last;
                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                wp += run;
            }
            last = pos + 1;
        }

        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Delete + squeeze from mmap'd byte slice.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

/// Squeeze from mmap'd byte slice.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut wp = 0;
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > buf_size {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > buf_size {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    if wp >= buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if memchr::memmem::find(data, &[ch, ch]).is_none() {
        return writer.write_all(data);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let len = data.len();
    let mut wp = 0;
    let mut cursor = 0;

    while cursor < len {
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let gap = pos - cursor;
                if gap > 0 {
                    if wp + gap > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if gap > buf_size {
                        writer.write_all(&data[cursor..pos])?;
                    } else {
                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
                        wp += gap;
                    }
                }
                if wp >= buf_size {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }
                outbuf[wp] = ch;
                wp += 1;
                cursor = pos + 1;
                while cursor < len && data[cursor] == ch {
                    cursor += 1;
                }
            }
            None => {
                let remaining = len - cursor;
                if remaining > 0 {
                    if wp + remaining > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if remaining > buf_size {
                        writer.write_all(&data[cursor..])?;
                    } else {
                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
                        wp += remaining;
                    }
                }
                break;
            }
        }
    }

    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
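
// ============================================================================
// Tests (illustrative sketch)
// ============================================================================
// A minimal sketch of unit tests for the public entry points, using in-memory
// readers (&[u8] implements Read) and Vec<u8> writers. Inputs and expected
// outputs follow the semantics implemented above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn translate_upcases_ascii_range() {
        // a-z -> A-Z is a contiguous range with a constant offset, so this also
        // exercises the SIMD range path on x86_64.
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut input: &[u8] = b"Hello, World!";
        let mut out = Vec::new();
        translate(&set1, &set2, &mut input, &mut out).unwrap();
        assert_eq!(out.as_slice(), b"HELLO, WORLD!");
    }

    #[test]
    fn translate_pads_short_set2_with_last_byte() {
        let mut input: &[u8] = b"cabbage";
        let mut out = Vec::new();
        translate(b"abc", b"x", &mut input, &mut out).unwrap();
        assert_eq!(out.as_slice(), b"xxxxxge");
    }

    #[test]
    fn translate_squeeze_collapses_translated_runs() {
        let mut input: &[u8] = b"aabbcc";
        let mut out = Vec::new();
        translate_squeeze(b"abc", b"xxx", &mut input, &mut out).unwrap();
        assert_eq!(out.as_slice(), b"x");
    }

    #[test]
    fn delete_drops_members() {
        let mut input: &[u8] = b"hello world";
        let mut out = Vec::new();
        delete(b"aeiou", &mut input, &mut out).unwrap();
        assert_eq!(out.as_slice(), b"hll wrld");
    }

    #[test]
    fn squeeze_collapses_runs() {
        let mut input: &[u8] = b"a  b   c";
        let mut out = Vec::new();
        squeeze(b" ", &mut input, &mut out).unwrap();
        assert_eq!(out.as_slice(), b"a b c");
    }

    #[test]
    fn delete_squeeze_combined() {
        let mut input: &[u8] = b"aabbbc";
        let mut out = Vec::new();
        delete_squeeze(b"a", b"b", &mut input, &mut out).unwrap();
        assert_eq!(out.as_slice(), b"bc");
    }

    #[test]
    fn mmap_variants_basic() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut out = Vec::new();
        translate_mmap(&set1, &set2, b"Hello, World!", &mut out).unwrap();
        assert_eq!(out.as_slice(), b"HELLO, WORLD!");

        let mut out = Vec::new();
        delete_mmap(b"l", b"hello", &mut out).unwrap();
        assert_eq!(out.as_slice(), b"heo");

        let mut out = Vec::new();
        squeeze_mmap(b"l", b"hello", &mut out).unwrap();
        assert_eq!(out.as_slice(), b"helo");
    }
}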