Skip to main content

coreutils_rs/tac/
core.rs

1use std::io::{self, IoSlice, Write};
2
3/// Maximum number of iovecs per writev() call (Linux IOV_MAX is 1024).
4const IOV_BATCH: usize = 1024;
5
6/// Maximum records for vectored I/O. Beyond this, use BufWriter to avoid
7/// excessive memory for IoSlice entries (16 bytes each).
8const VECTORED_MAX_RECORDS: usize = 256 * 1024;
9
10/// Write all IoSlices to the writer, handling partial writes.
11/// Batches into IOV_BATCH-sized groups for writev() efficiency.
12fn write_all_slices(out: &mut impl Write, slices: &[IoSlice<'_>]) -> io::Result<()> {
13    let mut offset = 0;
14    while offset < slices.len() {
15        let end = (offset + IOV_BATCH).min(slices.len());
16        let n = out.write_vectored(&slices[offset..end])?;
17        if n == 0 {
18            return Err(io::Error::new(
19                io::ErrorKind::WriteZero,
20                "failed to write any data",
21            ));
22        }
23        // Skip fully-written slices
24        let mut remaining = n;
25        while offset < end && remaining >= slices[offset].len() {
26            remaining -= slices[offset].len();
27            offset += 1;
28        }
29        // Handle partial write within a slice — use write_all for the remainder
30        if remaining > 0 && offset < end {
31            out.write_all(&slices[offset][remaining..])?;
32            offset += 1;
33        }
34    }
35    Ok(())
36}
37
38/// Reverse the records in `data` separated by a single byte `separator` and write to `out`.
39/// If `before` is true, the separator is attached before the record instead of after.
40/// Uses vectored I/O (writev) to write directly from mmap'd data — zero intermediate copies.
41pub fn tac_bytes(data: &[u8], separator: u8, before: bool, out: &mut impl Write) -> io::Result<()> {
42    if data.is_empty() {
43        return Ok(());
44    }
45
46    // Forward SIMD scan to collect all separator positions
47    let positions: Vec<usize> = memchr::memchr_iter(separator, data).collect();
48
49    if positions.is_empty() {
50        out.write_all(data)?;
51        return Ok(());
52    }
53
54    // For files with many records, use BufWriter to avoid excessive IoSlice memory.
55    // For smaller record counts, use vectored I/O for zero-copy writes.
56    if positions.len() > VECTORED_MAX_RECORDS {
57        return tac_bytes_bufwriter(data, separator, before, &positions, out);
58    }
59
60    // Build IoSlice list pointing directly into mmap'd data — zero copies
61    let sep_byte = [separator];
62
63    if !before {
64        let has_trailing_sep = *positions.last().unwrap() == data.len() - 1;
65        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 4);
66
67        // GNU tac appends separator to trailing content without one
68        if !has_trailing_sep {
69            let last_sep = *positions.last().unwrap();
70            slices.push(IoSlice::new(&data[last_sep + 1..]));
71            slices.push(IoSlice::new(&sep_byte));
72        }
73
74        let mut i = positions.len();
75        while i > 0 {
76            i -= 1;
77            let end = positions[i] + 1;
78            let start = if i == 0 { 0 } else { positions[i - 1] + 1 };
79            slices.push(IoSlice::new(&data[start..end]));
80        }
81
82        write_all_slices(out, &slices)?;
83    } else {
84        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
85
86        let mut i = positions.len();
87        while i > 0 {
88            i -= 1;
89            let start = positions[i];
90            let end = if i + 1 < positions.len() {
91                positions[i + 1]
92            } else {
93                data.len()
94            };
95            slices.push(IoSlice::new(&data[start..end]));
96        }
97
98        if positions[0] > 0 {
99            slices.push(IoSlice::new(&data[..positions[0]]));
100        }
101
102        write_all_slices(out, &slices)?;
103    }
104
105    Ok(())
106}
107
108/// BufWriter fallback for tac_bytes when there are too many records for vectored I/O.
109fn tac_bytes_bufwriter(
110    data: &[u8],
111    separator: u8,
112    before: bool,
113    positions: &[usize],
114    out: &mut impl Write,
115) -> io::Result<()> {
116    let mut buf = io::BufWriter::with_capacity(4 * 1024 * 1024, out);
117
118    if !before {
119        let has_trailing_sep = *positions.last().unwrap() == data.len() - 1;
120        if !has_trailing_sep {
121            let last_sep = *positions.last().unwrap();
122            buf.write_all(&data[last_sep + 1..])?;
123            buf.write_all(&[separator])?;
124        }
125        let mut i = positions.len();
126        while i > 0 {
127            i -= 1;
128            let end = positions[i] + 1;
129            let start = if i == 0 { 0 } else { positions[i - 1] + 1 };
130            buf.write_all(&data[start..end])?;
131        }
132    } else {
133        let mut i = positions.len();
134        while i > 0 {
135            i -= 1;
136            let start = positions[i];
137            let end = if i + 1 < positions.len() {
138                positions[i + 1]
139            } else {
140                data.len()
141            };
142            buf.write_all(&data[start..end])?;
143        }
144        if positions[0] > 0 {
145            buf.write_all(&data[..positions[0]])?;
146        }
147    }
148    buf.flush()?;
149    Ok(())
150}
151
152/// Reverse records using a multi-byte string separator.
153/// Uses SIMD-accelerated memmem for substring search.
154pub fn tac_string_separator(
155    data: &[u8],
156    separator: &[u8],
157    before: bool,
158    out: &mut impl Write,
159) -> io::Result<()> {
160    if data.is_empty() {
161        return Ok(());
162    }
163
164    if separator.len() == 1 {
165        return tac_bytes(data, separator[0], before, out);
166    }
167
168    // Find all occurrences of the separator using SIMD-accelerated memmem
169    let positions: Vec<usize> = memchr::memmem::find_iter(data, separator).collect();
170
171    if positions.is_empty() {
172        out.write_all(data)?;
173        return Ok(());
174    }
175
176    let sep_len = separator.len();
177
178    if !before {
179        let last_end = positions.last().unwrap() + sep_len;
180        let has_trailing_sep = last_end == data.len();
181        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 4);
182
183        // GNU tac appends separator to trailing content without one
184        if !has_trailing_sep {
185            slices.push(IoSlice::new(&data[last_end..]));
186            slices.push(IoSlice::new(separator));
187        }
188
189        let mut i = positions.len();
190        while i > 0 {
191            i -= 1;
192            let sep_start = positions[i];
193            let rec_start = if i == 0 {
194                0
195            } else {
196                positions[i - 1] + sep_len
197            };
198            slices.push(IoSlice::new(&data[rec_start..sep_start + sep_len]));
199        }
200
201        write_all_slices(out, &slices)?;
202    } else {
203        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
204
205        let mut i = positions.len();
206        while i > 0 {
207            i -= 1;
208            let start = positions[i];
209            let end = if i + 1 < positions.len() {
210                positions[i + 1]
211            } else {
212                data.len()
213            };
214            slices.push(IoSlice::new(&data[start..end]));
215        }
216
217        if positions[0] > 0 {
218            slices.push(IoSlice::new(&data[..positions[0]]));
219        }
220
221        write_all_slices(out, &slices)?;
222    }
223
224    Ok(())
225}
226
227/// Find regex matches using backward scanning, matching GNU tac's re_search behavior.
228/// GNU tac scans backward from the end, finding the rightmost starting position first.
229/// This produces different matches than forward scanning for patterns like [0-9]+.
230/// The matches are returned in left-to-right order.
231fn find_regex_matches_backward(data: &[u8], re: &regex::bytes::Regex) -> Vec<(usize, usize)> {
232    let mut matches = Vec::new();
233    let mut past_end = data.len();
234
235    while past_end > 0 {
236        let buf = &data[..past_end];
237        let mut found = false;
238
239        // Scan backward: try positions from past_end-1 down to 0
240        // We need the LAST match starting position in buf, so we try from the end
241        let mut pos = past_end;
242        while pos > 0 {
243            pos -= 1;
244            if let Some(m) = re.find_at(buf, pos) {
245                if m.start() == pos {
246                    // Match starts at exactly this position — this is the rightmost match start
247                    matches.push((m.start(), m.end()));
248                    past_end = m.start();
249                    found = true;
250                    break;
251                }
252                // Match starts later than pos — skip to before that match
253                // No point checking positions between pos and m.start() since
254                // find_at already told us the leftmost match from pos starts at m.start()
255                // But we need matches that START before m.start(), so continue decrementing
256            }
257            // If None, there's no match at pos or later, but there might be one earlier
258            // (find_at only searches forward from pos)
259        }
260
261        if !found {
262            break;
263        }
264    }
265
266    matches.reverse(); // Convert from backward order to left-to-right order
267    matches
268}
269
270/// Reverse records using a regex separator.
271/// Uses regex::bytes for direct byte-level matching (no UTF-8 conversion needed).
272/// NOTE: GNU tac uses POSIX Basic Regular Expressions (BRE), so we convert to ERE first.
273/// Uses backward scanning to match GNU tac's re_search behavior.
274pub fn tac_regex_separator(
275    data: &[u8],
276    pattern: &str,
277    before: bool,
278    out: &mut impl Write,
279) -> io::Result<()> {
280    if data.is_empty() {
281        return Ok(());
282    }
283
284    let re = match regex::bytes::Regex::new(pattern) {
285        Ok(r) => r,
286        Err(e) => {
287            return Err(io::Error::new(
288                io::ErrorKind::InvalidInput,
289                format!("invalid regex '{}': {}", pattern, e),
290            ));
291        }
292    };
293
294    // Use backward scanning to match GNU tac's re_search behavior
295    let matches = find_regex_matches_backward(data, &re);
296
297    if matches.is_empty() {
298        out.write_all(data)?;
299        return Ok(());
300    }
301
302    if !before {
303        let last_end = matches.last().unwrap().1;
304        let has_trailing_sep = last_end == data.len();
305        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 4);
306
307        // GNU tac appends the last separator match to close trailing content
308        if !has_trailing_sep {
309            slices.push(IoSlice::new(&data[last_end..]));
310            let last_match = matches.last().unwrap();
311            slices.push(IoSlice::new(&data[last_match.0..last_match.1]));
312        }
313
314        let mut i = matches.len();
315        while i > 0 {
316            i -= 1;
317            let rec_start = if i == 0 { 0 } else { matches[i - 1].1 };
318            let rec_end = matches[i].1;
319            slices.push(IoSlice::new(&data[rec_start..rec_end]));
320        }
321
322        write_all_slices(out, &slices)?;
323    } else {
324        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 2);
325
326        let mut i = matches.len();
327        while i > 0 {
328            i -= 1;
329            let start = matches[i].0;
330            let end = if i + 1 < matches.len() {
331                matches[i + 1].0
332            } else {
333                data.len()
334            };
335            slices.push(IoSlice::new(&data[start..end]));
336        }
337
338        if matches[0].0 > 0 {
339            slices.push(IoSlice::new(&data[..matches[0].0]));
340        }
341
342        write_all_slices(out, &slices)?;
343    }
344
345    Ok(())
346}