Skip to main content

coreutils_rs/tac/
core.rs

1use std::io::{self, IoSlice, Write};
2
3/// Maximum number of iovecs per writev() call (Linux IOV_MAX is 1024).
4const IOV_BATCH: usize = 1024;
5
6/// Write all IoSlices to the writer, handling partial writes.
7/// For large numbers of slices, batches into IOV_BATCH-sized groups.
8fn write_all_slices(out: &mut impl Write, slices: &[IoSlice<'_>]) -> io::Result<()> {
9    // Small number of slices: use simple write_all for each
10    if slices.len() <= 4 {
11        for s in slices {
12            out.write_all(s)?;
13        }
14        return Ok(());
15    }
16
17    let mut offset = 0;
18    while offset < slices.len() {
19        let end = (offset + IOV_BATCH).min(slices.len());
20        let n = out.write_vectored(&slices[offset..end])?;
21        if n == 0 {
22            return Err(io::Error::new(
23                io::ErrorKind::WriteZero,
24                "failed to write any data",
25            ));
26        }
27        let mut remaining = n;
28        while offset < end && remaining >= slices[offset].len() {
29            remaining -= slices[offset].len();
30            offset += 1;
31        }
32        if remaining > 0 && offset < end {
33            out.write_all(&slices[offset][remaining..])?;
34            offset += 1;
35        }
36    }
37    Ok(())
38}
39
40/// Reverse records separated by a single byte.
41/// Uses a single forward SIMD scan to collect all separator positions,
42/// then builds IoSlice references in reverse order for zero-copy output.
43/// Eliminates per-record memrchr calls AND the copy to an output buffer.
44pub fn tac_bytes(data: &[u8], separator: u8, before: bool, out: &mut impl Write) -> io::Result<()> {
45    if data.is_empty() {
46        return Ok(());
47    }
48
49    // For small data, use simple backward scan with direct writes
50    if data.len() < 64 * 1024 {
51        return tac_bytes_small(data, separator, before, out);
52    }
53
54    // Single forward SIMD scan — O(n) with memchr auto-vectorization
55    let positions: Vec<usize> = memchr::memchr_iter(separator, data).collect();
56
57    if positions.is_empty() {
58        return out.write_all(data);
59    }
60
61    let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
62
63    if !before {
64        // separator-after mode: records end with separator
65        let last_pos = *positions.last().unwrap();
66
67        // Trailing content without separator — output first
68        if last_pos < data.len() - 1 {
69            slices.push(IoSlice::new(&data[last_pos + 1..]));
70        }
71
72        // Records in reverse order (each includes its trailing separator)
73        for i in (0..positions.len()).rev() {
74            let start = if i == 0 { 0 } else { positions[i - 1] + 1 };
75            slices.push(IoSlice::new(&data[start..positions[i] + 1]));
76        }
77    } else {
78        // separator-before mode: records start with separator
79        for i in (0..positions.len()).rev() {
80            let end = if i + 1 < positions.len() {
81                positions[i + 1]
82            } else {
83                data.len()
84            };
85            slices.push(IoSlice::new(&data[positions[i]..end]));
86        }
87
88        // Leading content before first separator
89        if positions[0] > 0 {
90            slices.push(IoSlice::new(&data[..positions[0]]));
91        }
92    }
93
94    write_all_slices(out, &slices)?;
95    Ok(())
96}
97
98/// Small-file path: backward scan with direct write_all calls.
99fn tac_bytes_small(
100    data: &[u8],
101    separator: u8,
102    before: bool,
103    out: &mut impl Write,
104) -> io::Result<()> {
105    if !before {
106        let mut end = data.len();
107
108        // Check for trailing content after last separator
109        if let Some(last_sep) = memchr::memrchr(separator, data) {
110            if last_sep < data.len() - 1 {
111                out.write_all(&data[last_sep + 1..])?;
112                end = last_sep + 1;
113            }
114        } else {
115            out.write_all(data)?;
116            return Ok(());
117        }
118
119        let mut cursor = end;
120        while cursor > 0 {
121            let search_end = cursor - 1;
122            let prev_sep = if search_end > 0 {
123                memchr::memrchr(separator, &data[..search_end])
124            } else {
125                None
126            };
127            let start = match prev_sep {
128                Some(pos) => pos + 1,
129                None => 0,
130            };
131            out.write_all(&data[start..cursor])?;
132            cursor = start;
133        }
134    } else {
135        let mut cursor = data.len();
136        while cursor > 0 {
137            match memchr::memrchr(separator, &data[..cursor]) {
138                Some(pos) => {
139                    out.write_all(&data[pos..cursor])?;
140                    cursor = pos;
141                }
142                None => {
143                    out.write_all(&data[..cursor])?;
144                    break;
145                }
146            }
147        }
148    }
149    Ok(())
150}
151
152/// Reverse records using a multi-byte string separator.
153/// Uses SIMD-accelerated memmem for substring search.
154pub fn tac_string_separator(
155    data: &[u8],
156    separator: &[u8],
157    before: bool,
158    out: &mut impl Write,
159) -> io::Result<()> {
160    if data.is_empty() {
161        return Ok(());
162    }
163
164    if separator.len() == 1 {
165        return tac_bytes(data, separator[0], before, out);
166    }
167
168    // Find all occurrences of the separator using SIMD-accelerated memmem
169    let positions: Vec<usize> = memchr::memmem::find_iter(data, separator).collect();
170
171    if positions.is_empty() {
172        out.write_all(data)?;
173        return Ok(());
174    }
175
176    let sep_len = separator.len();
177
178    if !before {
179        let last_end = positions.last().unwrap() + sep_len;
180        let has_trailing_sep = last_end == data.len();
181        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 4);
182
183        // Trailing content without separator — output as-is
184        if !has_trailing_sep {
185            slices.push(IoSlice::new(&data[last_end..]));
186        }
187
188        let mut i = positions.len();
189        while i > 0 {
190            i -= 1;
191            let sep_start = positions[i];
192            let rec_start = if i == 0 {
193                0
194            } else {
195                positions[i - 1] + sep_len
196            };
197            slices.push(IoSlice::new(&data[rec_start..sep_start + sep_len]));
198        }
199
200        write_all_slices(out, &slices)?;
201    } else {
202        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
203
204        let mut i = positions.len();
205        while i > 0 {
206            i -= 1;
207            let start = positions[i];
208            let end = if i + 1 < positions.len() {
209                positions[i + 1]
210            } else {
211                data.len()
212            };
213            slices.push(IoSlice::new(&data[start..end]));
214        }
215
216        if positions[0] > 0 {
217            slices.push(IoSlice::new(&data[..positions[0]]));
218        }
219
220        write_all_slices(out, &slices)?;
221    }
222
223    Ok(())
224}
225
226/// Find regex matches using backward scanning, matching GNU tac's re_search behavior.
227/// GNU tac scans backward from the end, finding the rightmost starting position first.
228/// This produces different matches than forward scanning for patterns like [0-9]+.
229/// The matches are returned in left-to-right order.
230fn find_regex_matches_backward(data: &[u8], re: &regex::bytes::Regex) -> Vec<(usize, usize)> {
231    let mut matches = Vec::new();
232    let mut past_end = data.len();
233
234    while past_end > 0 {
235        let buf = &data[..past_end];
236        let mut found = false;
237
238        // Scan backward: try positions from past_end-1 down to 0
239        // We need the LAST match starting position in buf, so we try from the end
240        let mut pos = past_end;
241        while pos > 0 {
242            pos -= 1;
243            if let Some(m) = re.find_at(buf, pos) {
244                if m.start() == pos {
245                    // Match starts at exactly this position — this is the rightmost match start
246                    matches.push((m.start(), m.end()));
247                    past_end = m.start();
248                    found = true;
249                    break;
250                }
251                // Match starts later than pos — skip to before that match
252                // No point checking positions between pos and m.start() since
253                // find_at already told us the leftmost match from pos starts at m.start()
254                // But we need matches that START before m.start(), so continue decrementing
255            }
256            // If None, there's no match at pos or later, but there might be one earlier
257            // (find_at only searches forward from pos)
258        }
259
260        if !found {
261            break;
262        }
263    }
264
265    matches.reverse(); // Convert from backward order to left-to-right order
266    matches
267}
268
269/// Reverse records using a regex separator.
270/// Uses regex::bytes for direct byte-level matching (no UTF-8 conversion needed).
271/// NOTE: GNU tac uses POSIX Basic Regular Expressions (BRE), so we convert to ERE first.
272/// Uses backward scanning to match GNU tac's re_search behavior.
273pub fn tac_regex_separator(
274    data: &[u8],
275    pattern: &str,
276    before: bool,
277    out: &mut impl Write,
278) -> io::Result<()> {
279    if data.is_empty() {
280        return Ok(());
281    }
282
283    let re = match regex::bytes::Regex::new(pattern) {
284        Ok(r) => r,
285        Err(e) => {
286            return Err(io::Error::new(
287                io::ErrorKind::InvalidInput,
288                format!("invalid regex '{}': {}", pattern, e),
289            ));
290        }
291    };
292
293    // Use backward scanning to match GNU tac's re_search behavior
294    let matches = find_regex_matches_backward(data, &re);
295
296    if matches.is_empty() {
297        out.write_all(data)?;
298        return Ok(());
299    }
300
301    if !before {
302        let last_end = matches.last().unwrap().1;
303        let has_trailing_sep = last_end == data.len();
304        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 4);
305
306        // Trailing content without separator — output as-is
307        if !has_trailing_sep {
308            slices.push(IoSlice::new(&data[last_end..]));
309        }
310
311        let mut i = matches.len();
312        while i > 0 {
313            i -= 1;
314            let rec_start = if i == 0 { 0 } else { matches[i - 1].1 };
315            let rec_end = matches[i].1;
316            slices.push(IoSlice::new(&data[rec_start..rec_end]));
317        }
318
319        write_all_slices(out, &slices)?;
320    } else {
321        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 2);
322
323        let mut i = matches.len();
324        while i > 0 {
325            i -= 1;
326            let start = matches[i].0;
327            let end = if i + 1 < matches.len() {
328                matches[i + 1].0
329            } else {
330                data.len()
331            };
332            slices.push(IoSlice::new(&data[start..end]));
333        }
334
335        if matches[0].0 > 0 {
336            slices.push(IoSlice::new(&data[..matches[0].0]));
337        }
338
339        write_all_slices(out, &slices)?;
340    }
341
342    Ok(())
343}