Skip to main content

coreutils_rs/tr/
core.rs

1use std::io::{self, Read, Write};
2
/// Size in bytes (4 MiB) of every input/output buffer allocated by the
/// functions in this file. A large buffer reduces syscall overhead.
const BUF_SIZE: usize = 4 * 1024 * 1024;
4
/// Builds the 256-entry byte translation table for `set1` -> `set2`.
///
/// Entry `b` of the result is the byte that input byte `b` is rewritten to.
/// Bytes that do not occur in `set1` map to themselves. When `set2` is shorter
/// than `set1`, the surplus `set1` bytes all map to the final byte of `set2`;
/// when `set2` is empty the result is the identity table. If a byte occurs
/// several times in `set1`, its last occurrence decides the mapping.
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table = [0u8; 256];
    for (slot, byte) in table.iter_mut().zip(0u8..=255) {
        *slot = byte;
    }
    // With an empty `set2` every source byte would be mapped onto itself, so
    // the identity table built above is already the final answer.
    if let Some(&pad) = set2.last() {
        // `set2` followed by an endless run of its last byte: zipping against
        // `set1` then yields the padded mapping without any index arithmetic.
        let targets = set2.iter().copied().chain(std::iter::repeat(pad));
        for (&from, to) in set1.iter().zip(targets) {
            table[usize::from(from)] = to;
        }
    }
    table
}
19
/// Packs the bytes of `chars` into a 256-bit membership bitmap (32 bytes).
///
/// Byte value `b` is represented by bit `b & 7` of element `b >> 3`, so a
/// later membership test is a single load plus a mask. Duplicate bytes in
/// `chars` are harmless.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    chars.iter().fold([0u8; 32], |mut bitmap, &byte| {
        bitmap[usize::from(byte >> 3)] |= 1u8 << (byte & 7);
        bitmap
    })
}
29
/// Returns `true` when byte `ch` is present in the 256-bit membership bitmap
/// `set`, where byte `b` is stored as bit `b & 7` of element `b >> 3`.
#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    // `ch >> 3` is at most 31 for any `u8`, so this index is always in bounds
    // for a 32-element array; no `unsafe` unchecked access is needed.
    (set[usize::from(ch >> 3)] & (1u8 << (ch & 7))) != 0
}
34
/// Reads from `reader` until `buf` is full or the reader reports end of input.
///
/// A single `read` call may return fewer bytes than requested (for example on
/// a pipe), so reads are repeated. Reads that fail with
/// [`io::ErrorKind::Interrupted`] are retried.
///
/// Returns the number of bytes stored at the front of `buf`. A value smaller
/// than `buf.len()` means the reader returned `Ok(0)`.
///
/// # Errors
///
/// Returns the first non-`Interrupted` error produced by `reader`; bytes read
/// before the error are not reported.
fn fill_buf(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0usize;
    while total < buf.len() {
        let got = match reader.read(&mut buf[total..]) {
            Ok(count) => count,
            Err(err) if err.kind() == io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err),
        };
        if got == 0 {
            // End of input: hand back whatever has been collected so far.
            break;
        }
        total += got;
    }
    Ok(total)
}
48
/// Writes the translation of every byte of `chunk` into the front of `out`,
/// using `table` as a byte-to-byte lookup table.
///
/// Only the first `chunk.len()` bytes of `out` are written; any remaining
/// bytes are left untouched.
///
/// # Panics
///
/// Panics if `out` is shorter than `chunk`. Checking this once up front keeps
/// the function sound for every caller, instead of relying on unchecked
/// writes that would run past the end of a short output slice.
#[inline(always)]
fn translate_chunk(chunk: &[u8], out: &mut [u8], table: &[u8; 256]) {
    assert!(
        out.len() >= chunk.len(),
        "translate_chunk: output buffer is shorter than the input chunk"
    );
    // Iterating both slices in lock-step needs no per-element bounds checks,
    // and a `u8` index into a 256-entry table is always in range.
    for (dst, &src) in out.iter_mut().zip(chunk) {
        *dst = table[usize::from(src)];
    }
}
81
82// ============================================================================
83// Streaming functions (Read + Write)
84// ============================================================================
85
86pub fn translate(
87    set1: &[u8],
88    set2: &[u8],
89    reader: &mut impl Read,
90    writer: &mut impl Write,
91) -> io::Result<()> {
92    let table = build_translate_table(set1, set2);
93    let mut buf = vec![0u8; BUF_SIZE];
94    let mut out = vec![0u8; BUF_SIZE];
95    loop {
96        let n = fill_buf(reader, &mut buf)?;
97        if n == 0 {
98            break;
99        }
100        translate_chunk(&buf[..n], &mut out[..n], &table);
101        writer.write_all(&out[..n])?;
102    }
103    Ok(())
104}
105
106pub fn translate_squeeze(
107    set1: &[u8],
108    set2: &[u8],
109    reader: &mut impl Read,
110    writer: &mut impl Write,
111) -> io::Result<()> {
112    let table = build_translate_table(set1, set2);
113    let squeeze_set = build_member_set(set2);
114    let mut outbuf = vec![0u8; BUF_SIZE];
115    let mut inbuf = vec![0u8; BUF_SIZE];
116    let mut last_squeezed: u16 = 256;
117
118    loop {
119        let n = fill_buf(reader, &mut inbuf)?;
120        if n == 0 {
121            break;
122        }
123        let mut out_pos = 0;
124        for &b in &inbuf[..n] {
125            let translated = unsafe { *table.get_unchecked(b as usize) };
126            if is_member(&squeeze_set, translated) {
127                if last_squeezed == translated as u16 {
128                    continue;
129                }
130                last_squeezed = translated as u16;
131            } else {
132                last_squeezed = 256;
133            }
134            unsafe {
135                *outbuf.get_unchecked_mut(out_pos) = translated;
136            }
137            out_pos += 1;
138        }
139        writer.write_all(&outbuf[..out_pos])?;
140    }
141    Ok(())
142}
143
144pub fn delete(
145    delete_chars: &[u8],
146    reader: &mut impl Read,
147    writer: &mut impl Write,
148) -> io::Result<()> {
149    let member = build_member_set(delete_chars);
150    let mut outbuf = vec![0u8; BUF_SIZE];
151    let mut inbuf = vec![0u8; BUF_SIZE];
152
153    loop {
154        let n = fill_buf(reader, &mut inbuf)?;
155        if n == 0 {
156            break;
157        }
158        let mut out_pos = 0;
159        for &b in &inbuf[..n] {
160            if !is_member(&member, b) {
161                unsafe {
162                    *outbuf.get_unchecked_mut(out_pos) = b;
163                }
164                out_pos += 1;
165            }
166        }
167        writer.write_all(&outbuf[..out_pos])?;
168    }
169    Ok(())
170}
171
172pub fn delete_squeeze(
173    delete_chars: &[u8],
174    squeeze_chars: &[u8],
175    reader: &mut impl Read,
176    writer: &mut impl Write,
177) -> io::Result<()> {
178    let delete_set = build_member_set(delete_chars);
179    let squeeze_set = build_member_set(squeeze_chars);
180    let mut outbuf = vec![0u8; BUF_SIZE];
181    let mut inbuf = vec![0u8; BUF_SIZE];
182    let mut last_squeezed: u16 = 256;
183
184    loop {
185        let n = fill_buf(reader, &mut inbuf)?;
186        if n == 0 {
187            break;
188        }
189        let mut out_pos = 0;
190        for &b in &inbuf[..n] {
191            if is_member(&delete_set, b) {
192                continue;
193            }
194            if is_member(&squeeze_set, b) {
195                if last_squeezed == b as u16 {
196                    continue;
197                }
198                last_squeezed = b as u16;
199            } else {
200                last_squeezed = 256;
201            }
202            unsafe {
203                *outbuf.get_unchecked_mut(out_pos) = b;
204            }
205            out_pos += 1;
206        }
207        writer.write_all(&outbuf[..out_pos])?;
208    }
209    Ok(())
210}
211
212pub fn squeeze(
213    squeeze_chars: &[u8],
214    reader: &mut impl Read,
215    writer: &mut impl Write,
216) -> io::Result<()> {
217    let member = build_member_set(squeeze_chars);
218    let mut outbuf = vec![0u8; BUF_SIZE];
219    let mut inbuf = vec![0u8; BUF_SIZE];
220    let mut last_squeezed: u16 = 256;
221
222    loop {
223        let n = fill_buf(reader, &mut inbuf)?;
224        if n == 0 {
225            break;
226        }
227        let mut out_pos = 0;
228        for &b in &inbuf[..n] {
229            if is_member(&member, b) {
230                if last_squeezed == b as u16 {
231                    continue;
232                }
233                last_squeezed = b as u16;
234            } else {
235                last_squeezed = 256;
236            }
237            unsafe {
238                *outbuf.get_unchecked_mut(out_pos) = b;
239            }
240            out_pos += 1;
241        }
242        writer.write_all(&outbuf[..out_pos])?;
243    }
244    Ok(())
245}
246
247// ============================================================================
248// Mmap-based functions (zero-copy input from byte slice)
249// ============================================================================
250
251/// Translate bytes from an mmap'd byte slice — zero syscall reads.
252pub fn translate_mmap(
253    set1: &[u8],
254    set2: &[u8],
255    data: &[u8],
256    writer: &mut impl Write,
257) -> io::Result<()> {
258    let table = build_translate_table(set1, set2);
259    let mut out = vec![0u8; BUF_SIZE];
260    for chunk in data.chunks(BUF_SIZE) {
261        translate_chunk(chunk, &mut out[..chunk.len()], &table);
262        writer.write_all(&out[..chunk.len()])?;
263    }
264    Ok(())
265}
266
267/// Translate + squeeze from mmap'd byte slice.
268pub fn translate_squeeze_mmap(
269    set1: &[u8],
270    set2: &[u8],
271    data: &[u8],
272    writer: &mut impl Write,
273) -> io::Result<()> {
274    let table = build_translate_table(set1, set2);
275    let squeeze_set = build_member_set(set2);
276    let mut outbuf = vec![0u8; BUF_SIZE];
277    let mut last_squeezed: u16 = 256;
278
279    for chunk in data.chunks(BUF_SIZE) {
280        let mut out_pos = 0;
281        for &b in chunk {
282            let translated = unsafe { *table.get_unchecked(b as usize) };
283            if is_member(&squeeze_set, translated) {
284                if last_squeezed == translated as u16 {
285                    continue;
286                }
287                last_squeezed = translated as u16;
288            } else {
289                last_squeezed = 256;
290            }
291            unsafe {
292                *outbuf.get_unchecked_mut(out_pos) = translated;
293            }
294            out_pos += 1;
295        }
296        writer.write_all(&outbuf[..out_pos])?;
297    }
298    Ok(())
299}
300
301/// Delete from mmap'd byte slice.
302pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
303    let member = build_member_set(delete_chars);
304    let mut outbuf = vec![0u8; BUF_SIZE];
305
306    for chunk in data.chunks(BUF_SIZE) {
307        let mut out_pos = 0;
308        for &b in chunk {
309            if !is_member(&member, b) {
310                unsafe {
311                    *outbuf.get_unchecked_mut(out_pos) = b;
312                }
313                out_pos += 1;
314            }
315        }
316        writer.write_all(&outbuf[..out_pos])?;
317    }
318    Ok(())
319}
320
321/// Delete + squeeze from mmap'd byte slice.
322pub fn delete_squeeze_mmap(
323    delete_chars: &[u8],
324    squeeze_chars: &[u8],
325    data: &[u8],
326    writer: &mut impl Write,
327) -> io::Result<()> {
328    let delete_set = build_member_set(delete_chars);
329    let squeeze_set = build_member_set(squeeze_chars);
330    let mut outbuf = vec![0u8; BUF_SIZE];
331    let mut last_squeezed: u16 = 256;
332
333    for chunk in data.chunks(BUF_SIZE) {
334        let mut out_pos = 0;
335        for &b in chunk {
336            if is_member(&delete_set, b) {
337                continue;
338            }
339            if is_member(&squeeze_set, b) {
340                if last_squeezed == b as u16 {
341                    continue;
342                }
343                last_squeezed = b as u16;
344            } else {
345                last_squeezed = 256;
346            }
347            unsafe {
348                *outbuf.get_unchecked_mut(out_pos) = b;
349            }
350            out_pos += 1;
351        }
352        writer.write_all(&outbuf[..out_pos])?;
353    }
354    Ok(())
355}
356
357/// Squeeze from mmap'd byte slice.
358pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
359    let member = build_member_set(squeeze_chars);
360    let mut outbuf = vec![0u8; BUF_SIZE];
361    let mut last_squeezed: u16 = 256;
362
363    for chunk in data.chunks(BUF_SIZE) {
364        let mut out_pos = 0;
365        for &b in chunk {
366            if is_member(&member, b) {
367                if last_squeezed == b as u16 {
368                    continue;
369                }
370                last_squeezed = b as u16;
371            } else {
372                last_squeezed = 256;
373            }
374            unsafe {
375                *outbuf.get_unchecked_mut(out_pos) = b;
376            }
377            out_pos += 1;
378        }
379        writer.write_all(&outbuf[..out_pos])?;
380    }
381    Ok(())
382}