Skip to main content

coreutils_rs/tac/
core.rs

1use std::io::{self, IoSlice, Write};
2
3/// Maximum number of iovecs per writev() call (Linux IOV_MAX is 1024).
4const IOV_BATCH: usize = 1024;
5
6/// Maximum records for vectored I/O. Beyond this, use BufWriter to avoid
7/// excessive memory for IoSlice entries (16 bytes each).
8const VECTORED_MAX_RECORDS: usize = 256 * 1024;
9
10/// Write all IoSlices to the writer, handling partial writes.
11/// Batches into IOV_BATCH-sized groups for writev() efficiency.
12fn write_all_slices(out: &mut impl Write, slices: &[IoSlice<'_>]) -> io::Result<()> {
13    let mut offset = 0;
14    while offset < slices.len() {
15        let end = (offset + IOV_BATCH).min(slices.len());
16        let n = out.write_vectored(&slices[offset..end])?;
17        if n == 0 {
18            return Err(io::Error::new(
19                io::ErrorKind::WriteZero,
20                "failed to write any data",
21            ));
22        }
23        // Skip fully-written slices
24        let mut remaining = n;
25        while offset < end && remaining >= slices[offset].len() {
26            remaining -= slices[offset].len();
27            offset += 1;
28        }
29        // Handle partial write within a slice — use write_all for the remainder
30        if remaining > 0 && offset < end {
31            out.write_all(&slices[offset][remaining..])?;
32            offset += 1;
33        }
34    }
35    Ok(())
36}
37
38/// Reverse the records in `data` separated by a single byte `separator` and write to `out`.
39/// If `before` is true, the separator is attached before the record instead of after.
40/// Uses vectored I/O (writev) to write directly from mmap'd data — zero intermediate copies.
41/// Threshold below which we use simple reverse write_all instead of vectored I/O.
42/// For small files (< 100 lines), the overhead of building IoSlice arrays isn't worth it.
43const SIMPLE_THRESHOLD: usize = 100;
44
45pub fn tac_bytes(data: &[u8], separator: u8, before: bool, out: &mut impl Write) -> io::Result<()> {
46    if data.is_empty() {
47        return Ok(());
48    }
49
50    // Forward SIMD scan to collect all separator positions
51    let positions: Vec<usize> = memchr::memchr_iter(separator, data).collect();
52
53    if positions.is_empty() {
54        out.write_all(data)?;
55        return Ok(());
56    }
57
58    // For small files, use simple write_all calls (avoids IoSlice overhead)
59    if positions.len() <= SIMPLE_THRESHOLD {
60        return tac_bytes_simple(data, separator, before, &positions, out);
61    }
62
63    // For files with many records, use BufWriter to avoid excessive IoSlice memory.
64    // For smaller record counts, use vectored I/O for zero-copy writes.
65    if positions.len() > VECTORED_MAX_RECORDS {
66        return tac_bytes_bufwriter(data, separator, before, &positions, out);
67    }
68
69    // Build IoSlice list pointing directly into mmap'd data — zero copies
70    if !before {
71        let has_trailing_sep = *positions.last().unwrap() == data.len() - 1;
72        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 4);
73
74        // Trailing content without separator — output as-is (no separator added)
75        if !has_trailing_sep {
76            let last_sep = *positions.last().unwrap();
77            slices.push(IoSlice::new(&data[last_sep + 1..]));
78        }
79
80        let mut i = positions.len();
81        while i > 0 {
82            i -= 1;
83            let end = positions[i] + 1;
84            let start = if i == 0 { 0 } else { positions[i - 1] + 1 };
85            slices.push(IoSlice::new(&data[start..end]));
86        }
87
88        write_all_slices(out, &slices)?;
89    } else {
90        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
91
92        let mut i = positions.len();
93        while i > 0 {
94            i -= 1;
95            let start = positions[i];
96            let end = if i + 1 < positions.len() {
97                positions[i + 1]
98            } else {
99                data.len()
100            };
101            slices.push(IoSlice::new(&data[start..end]));
102        }
103
104        if positions[0] > 0 {
105            slices.push(IoSlice::new(&data[..positions[0]]));
106        }
107
108        write_all_slices(out, &slices)?;
109    }
110
111    Ok(())
112}
113
114/// Simple reverse-write path for small files — avoids IoSlice allocation overhead.
115/// Uses direct write_all calls with a small staging buffer.
116fn tac_bytes_simple(
117    data: &[u8],
118    _separator: u8,
119    before: bool,
120    positions: &[usize],
121    out: &mut impl Write,
122) -> io::Result<()> {
123    if !before {
124        let has_trailing_sep = *positions.last().unwrap() == data.len() - 1;
125
126        if !has_trailing_sep {
127            let last_sep = *positions.last().unwrap();
128            out.write_all(&data[last_sep + 1..])?;
129        }
130
131        let mut i = positions.len();
132        while i > 0 {
133            i -= 1;
134            let end = positions[i] + 1;
135            let start = if i == 0 { 0 } else { positions[i - 1] + 1 };
136            out.write_all(&data[start..end])?;
137        }
138    } else {
139        let mut i = positions.len();
140        while i > 0 {
141            i -= 1;
142            let start = positions[i];
143            let end = if i + 1 < positions.len() {
144                positions[i + 1]
145            } else {
146                data.len()
147            };
148            out.write_all(&data[start..end])?;
149        }
150        if positions[0] > 0 {
151            out.write_all(&data[..positions[0]])?;
152        }
153    }
154    Ok(())
155}
156
157/// BufWriter fallback for tac_bytes when there are too many records for vectored I/O.
158fn tac_bytes_bufwriter(
159    data: &[u8],
160    _separator: u8,
161    before: bool,
162    positions: &[usize],
163    out: &mut impl Write,
164) -> io::Result<()> {
165    let mut buf = io::BufWriter::with_capacity(4 * 1024 * 1024, out);
166
167    if !before {
168        let has_trailing_sep = *positions.last().unwrap() == data.len() - 1;
169        if !has_trailing_sep {
170            let last_sep = *positions.last().unwrap();
171            buf.write_all(&data[last_sep + 1..])?;
172        }
173        let mut i = positions.len();
174        while i > 0 {
175            i -= 1;
176            let end = positions[i] + 1;
177            let start = if i == 0 { 0 } else { positions[i - 1] + 1 };
178            buf.write_all(&data[start..end])?;
179        }
180    } else {
181        let mut i = positions.len();
182        while i > 0 {
183            i -= 1;
184            let start = positions[i];
185            let end = if i + 1 < positions.len() {
186                positions[i + 1]
187            } else {
188                data.len()
189            };
190            buf.write_all(&data[start..end])?;
191        }
192        if positions[0] > 0 {
193            buf.write_all(&data[..positions[0]])?;
194        }
195    }
196    buf.flush()?;
197    Ok(())
198}
199
200/// Reverse records using a multi-byte string separator.
201/// Uses SIMD-accelerated memmem for substring search.
202pub fn tac_string_separator(
203    data: &[u8],
204    separator: &[u8],
205    before: bool,
206    out: &mut impl Write,
207) -> io::Result<()> {
208    if data.is_empty() {
209        return Ok(());
210    }
211
212    if separator.len() == 1 {
213        return tac_bytes(data, separator[0], before, out);
214    }
215
216    // Find all occurrences of the separator using SIMD-accelerated memmem
217    let positions: Vec<usize> = memchr::memmem::find_iter(data, separator).collect();
218
219    if positions.is_empty() {
220        out.write_all(data)?;
221        return Ok(());
222    }
223
224    let sep_len = separator.len();
225
226    if !before {
227        let last_end = positions.last().unwrap() + sep_len;
228        let has_trailing_sep = last_end == data.len();
229        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 4);
230
231        // Trailing content without separator — output as-is
232        if !has_trailing_sep {
233            slices.push(IoSlice::new(&data[last_end..]));
234        }
235
236        let mut i = positions.len();
237        while i > 0 {
238            i -= 1;
239            let sep_start = positions[i];
240            let rec_start = if i == 0 {
241                0
242            } else {
243                positions[i - 1] + sep_len
244            };
245            slices.push(IoSlice::new(&data[rec_start..sep_start + sep_len]));
246        }
247
248        write_all_slices(out, &slices)?;
249    } else {
250        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
251
252        let mut i = positions.len();
253        while i > 0 {
254            i -= 1;
255            let start = positions[i];
256            let end = if i + 1 < positions.len() {
257                positions[i + 1]
258            } else {
259                data.len()
260            };
261            slices.push(IoSlice::new(&data[start..end]));
262        }
263
264        if positions[0] > 0 {
265            slices.push(IoSlice::new(&data[..positions[0]]));
266        }
267
268        write_all_slices(out, &slices)?;
269    }
270
271    Ok(())
272}
273
274/// Find regex matches using backward scanning, matching GNU tac's re_search behavior.
275/// GNU tac scans backward from the end, finding the rightmost starting position first.
276/// This produces different matches than forward scanning for patterns like [0-9]+.
277/// The matches are returned in left-to-right order.
278fn find_regex_matches_backward(data: &[u8], re: &regex::bytes::Regex) -> Vec<(usize, usize)> {
279    let mut matches = Vec::new();
280    let mut past_end = data.len();
281
282    while past_end > 0 {
283        let buf = &data[..past_end];
284        let mut found = false;
285
286        // Scan backward: try positions from past_end-1 down to 0
287        // We need the LAST match starting position in buf, so we try from the end
288        let mut pos = past_end;
289        while pos > 0 {
290            pos -= 1;
291            if let Some(m) = re.find_at(buf, pos) {
292                if m.start() == pos {
293                    // Match starts at exactly this position — this is the rightmost match start
294                    matches.push((m.start(), m.end()));
295                    past_end = m.start();
296                    found = true;
297                    break;
298                }
299                // Match starts later than pos — skip to before that match
300                // No point checking positions between pos and m.start() since
301                // find_at already told us the leftmost match from pos starts at m.start()
302                // But we need matches that START before m.start(), so continue decrementing
303            }
304            // If None, there's no match at pos or later, but there might be one earlier
305            // (find_at only searches forward from pos)
306        }
307
308        if !found {
309            break;
310        }
311    }
312
313    matches.reverse(); // Convert from backward order to left-to-right order
314    matches
315}
316
317/// Reverse records using a regex separator.
318/// Uses regex::bytes for direct byte-level matching (no UTF-8 conversion needed).
319/// NOTE: GNU tac uses POSIX Basic Regular Expressions (BRE), so we convert to ERE first.
320/// Uses backward scanning to match GNU tac's re_search behavior.
321pub fn tac_regex_separator(
322    data: &[u8],
323    pattern: &str,
324    before: bool,
325    out: &mut impl Write,
326) -> io::Result<()> {
327    if data.is_empty() {
328        return Ok(());
329    }
330
331    let re = match regex::bytes::Regex::new(pattern) {
332        Ok(r) => r,
333        Err(e) => {
334            return Err(io::Error::new(
335                io::ErrorKind::InvalidInput,
336                format!("invalid regex '{}': {}", pattern, e),
337            ));
338        }
339    };
340
341    // Use backward scanning to match GNU tac's re_search behavior
342    let matches = find_regex_matches_backward(data, &re);
343
344    if matches.is_empty() {
345        out.write_all(data)?;
346        return Ok(());
347    }
348
349    if !before {
350        let last_end = matches.last().unwrap().1;
351        let has_trailing_sep = last_end == data.len();
352        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 4);
353
354        // Trailing content without separator — output as-is
355        if !has_trailing_sep {
356            slices.push(IoSlice::new(&data[last_end..]));
357        }
358
359        let mut i = matches.len();
360        while i > 0 {
361            i -= 1;
362            let rec_start = if i == 0 { 0 } else { matches[i - 1].1 };
363            let rec_end = matches[i].1;
364            slices.push(IoSlice::new(&data[rec_start..rec_end]));
365        }
366
367        write_all_slices(out, &slices)?;
368    } else {
369        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 2);
370
371        let mut i = matches.len();
372        while i > 0 {
373            i -= 1;
374            let start = matches[i].0;
375            let end = if i + 1 < matches.len() {
376                matches[i + 1].0
377            } else {
378                data.len()
379            };
380            slices.push(IoSlice::new(&data[start..end]));
381        }
382
383        if matches[0].0 > 0 {
384            slices.push(IoSlice::new(&data[..matches[0].0]));
385        }
386
387        write_all_slices(out, &slices)?;
388    }
389
390    Ok(())
391}