Skip to main content

coreutils_rs/tac/
core.rs

1use std::io::{self, IoSlice, Write};
2
3/// Maximum number of iovecs per writev() call (Linux IOV_MAX is 1024).
4const IOV_BATCH: usize = 1024;
5
6/// Maximum records for vectored I/O. Beyond this, use BufWriter to avoid
7/// excessive memory for IoSlice entries (16 bytes each).
8const VECTORED_MAX_RECORDS: usize = 256 * 1024;
9
10/// Write all IoSlices to the writer, handling partial writes.
11/// Batches into IOV_BATCH-sized groups for writev() efficiency.
12fn write_all_slices(out: &mut impl Write, slices: &[IoSlice<'_>]) -> io::Result<()> {
13    let mut offset = 0;
14    while offset < slices.len() {
15        let end = (offset + IOV_BATCH).min(slices.len());
16        let n = out.write_vectored(&slices[offset..end])?;
17        if n == 0 {
18            return Err(io::Error::new(
19                io::ErrorKind::WriteZero,
20                "failed to write any data",
21            ));
22        }
23        // Skip fully-written slices
24        let mut remaining = n;
25        while offset < end && remaining >= slices[offset].len() {
26            remaining -= slices[offset].len();
27            offset += 1;
28        }
29        // Handle partial write within a slice — use write_all for the remainder
30        if remaining > 0 && offset < end {
31            out.write_all(&slices[offset][remaining..])?;
32            offset += 1;
33        }
34    }
35    Ok(())
36}
37
38/// Reverse the records in `data` separated by a single byte `separator` and write to `out`.
39/// If `before` is true, the separator is attached before the record instead of after.
40/// Uses vectored I/O (writev) to write directly from mmap'd data — zero intermediate copies.
41/// Threshold below which we use simple reverse write_all instead of vectored I/O.
42/// For small files (< 100 lines), the overhead of building IoSlice arrays isn't worth it.
43const SIMPLE_THRESHOLD: usize = 100;
44
45pub fn tac_bytes(data: &[u8], separator: u8, before: bool, out: &mut impl Write) -> io::Result<()> {
46    if data.is_empty() {
47        return Ok(());
48    }
49
50    // Forward SIMD scan to collect all separator positions
51    let positions: Vec<usize> = memchr::memchr_iter(separator, data).collect();
52
53    if positions.is_empty() {
54        out.write_all(data)?;
55        return Ok(());
56    }
57
58    // For small files, use simple write_all calls (avoids IoSlice overhead)
59    if positions.len() <= SIMPLE_THRESHOLD {
60        return tac_bytes_simple(data, separator, before, &positions, out);
61    }
62
63    // For files with many records, use BufWriter to avoid excessive IoSlice memory.
64    // For smaller record counts, use vectored I/O for zero-copy writes.
65    if positions.len() > VECTORED_MAX_RECORDS {
66        return tac_bytes_bufwriter(data, separator, before, &positions, out);
67    }
68
69    // Build IoSlice list pointing directly into mmap'd data — zero copies
70    let sep_byte = [separator];
71
72    if !before {
73        let has_trailing_sep = *positions.last().unwrap() == data.len() - 1;
74        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 4);
75
76        // GNU tac appends separator to trailing content without one
77        if !has_trailing_sep {
78            let last_sep = *positions.last().unwrap();
79            slices.push(IoSlice::new(&data[last_sep + 1..]));
80            slices.push(IoSlice::new(&sep_byte));
81        }
82
83        let mut i = positions.len();
84        while i > 0 {
85            i -= 1;
86            let end = positions[i] + 1;
87            let start = if i == 0 { 0 } else { positions[i - 1] + 1 };
88            slices.push(IoSlice::new(&data[start..end]));
89        }
90
91        write_all_slices(out, &slices)?;
92    } else {
93        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
94
95        let mut i = positions.len();
96        while i > 0 {
97            i -= 1;
98            let start = positions[i];
99            let end = if i + 1 < positions.len() {
100                positions[i + 1]
101            } else {
102                data.len()
103            };
104            slices.push(IoSlice::new(&data[start..end]));
105        }
106
107        if positions[0] > 0 {
108            slices.push(IoSlice::new(&data[..positions[0]]));
109        }
110
111        write_all_slices(out, &slices)?;
112    }
113
114    Ok(())
115}
116
117/// Simple reverse-write path for small files — avoids IoSlice allocation overhead.
118/// Uses direct write_all calls with a small staging buffer.
119fn tac_bytes_simple(
120    data: &[u8],
121    separator: u8,
122    before: bool,
123    positions: &[usize],
124    out: &mut impl Write,
125) -> io::Result<()> {
126    if !before {
127        let has_trailing_sep = *positions.last().unwrap() == data.len() - 1;
128
129        if !has_trailing_sep {
130            let last_sep = *positions.last().unwrap();
131            out.write_all(&data[last_sep + 1..])?;
132            out.write_all(&[separator])?;
133        }
134
135        let mut i = positions.len();
136        while i > 0 {
137            i -= 1;
138            let end = positions[i] + 1;
139            let start = if i == 0 { 0 } else { positions[i - 1] + 1 };
140            out.write_all(&data[start..end])?;
141        }
142    } else {
143        let mut i = positions.len();
144        while i > 0 {
145            i -= 1;
146            let start = positions[i];
147            let end = if i + 1 < positions.len() {
148                positions[i + 1]
149            } else {
150                data.len()
151            };
152            out.write_all(&data[start..end])?;
153        }
154        if positions[0] > 0 {
155            out.write_all(&data[..positions[0]])?;
156        }
157    }
158    Ok(())
159}
160
161/// BufWriter fallback for tac_bytes when there are too many records for vectored I/O.
162fn tac_bytes_bufwriter(
163    data: &[u8],
164    separator: u8,
165    before: bool,
166    positions: &[usize],
167    out: &mut impl Write,
168) -> io::Result<()> {
169    let mut buf = io::BufWriter::with_capacity(4 * 1024 * 1024, out);
170
171    if !before {
172        let has_trailing_sep = *positions.last().unwrap() == data.len() - 1;
173        if !has_trailing_sep {
174            let last_sep = *positions.last().unwrap();
175            buf.write_all(&data[last_sep + 1..])?;
176            buf.write_all(&[separator])?;
177        }
178        let mut i = positions.len();
179        while i > 0 {
180            i -= 1;
181            let end = positions[i] + 1;
182            let start = if i == 0 { 0 } else { positions[i - 1] + 1 };
183            buf.write_all(&data[start..end])?;
184        }
185    } else {
186        let mut i = positions.len();
187        while i > 0 {
188            i -= 1;
189            let start = positions[i];
190            let end = if i + 1 < positions.len() {
191                positions[i + 1]
192            } else {
193                data.len()
194            };
195            buf.write_all(&data[start..end])?;
196        }
197        if positions[0] > 0 {
198            buf.write_all(&data[..positions[0]])?;
199        }
200    }
201    buf.flush()?;
202    Ok(())
203}
204
205/// Reverse records using a multi-byte string separator.
206/// Uses SIMD-accelerated memmem for substring search.
207pub fn tac_string_separator(
208    data: &[u8],
209    separator: &[u8],
210    before: bool,
211    out: &mut impl Write,
212) -> io::Result<()> {
213    if data.is_empty() {
214        return Ok(());
215    }
216
217    if separator.len() == 1 {
218        return tac_bytes(data, separator[0], before, out);
219    }
220
221    // Find all occurrences of the separator using SIMD-accelerated memmem
222    let positions: Vec<usize> = memchr::memmem::find_iter(data, separator).collect();
223
224    if positions.is_empty() {
225        out.write_all(data)?;
226        return Ok(());
227    }
228
229    let sep_len = separator.len();
230
231    if !before {
232        let last_end = positions.last().unwrap() + sep_len;
233        let has_trailing_sep = last_end == data.len();
234        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 4);
235
236        // GNU tac appends separator to trailing content without one
237        if !has_trailing_sep {
238            slices.push(IoSlice::new(&data[last_end..]));
239            slices.push(IoSlice::new(separator));
240        }
241
242        let mut i = positions.len();
243        while i > 0 {
244            i -= 1;
245            let sep_start = positions[i];
246            let rec_start = if i == 0 {
247                0
248            } else {
249                positions[i - 1] + sep_len
250            };
251            slices.push(IoSlice::new(&data[rec_start..sep_start + sep_len]));
252        }
253
254        write_all_slices(out, &slices)?;
255    } else {
256        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
257
258        let mut i = positions.len();
259        while i > 0 {
260            i -= 1;
261            let start = positions[i];
262            let end = if i + 1 < positions.len() {
263                positions[i + 1]
264            } else {
265                data.len()
266            };
267            slices.push(IoSlice::new(&data[start..end]));
268        }
269
270        if positions[0] > 0 {
271            slices.push(IoSlice::new(&data[..positions[0]]));
272        }
273
274        write_all_slices(out, &slices)?;
275    }
276
277    Ok(())
278}
279
280/// Find regex matches using backward scanning, matching GNU tac's re_search behavior.
281/// GNU tac scans backward from the end, finding the rightmost starting position first.
282/// This produces different matches than forward scanning for patterns like [0-9]+.
283/// The matches are returned in left-to-right order.
284fn find_regex_matches_backward(data: &[u8], re: &regex::bytes::Regex) -> Vec<(usize, usize)> {
285    let mut matches = Vec::new();
286    let mut past_end = data.len();
287
288    while past_end > 0 {
289        let buf = &data[..past_end];
290        let mut found = false;
291
292        // Scan backward: try positions from past_end-1 down to 0
293        // We need the LAST match starting position in buf, so we try from the end
294        let mut pos = past_end;
295        while pos > 0 {
296            pos -= 1;
297            if let Some(m) = re.find_at(buf, pos) {
298                if m.start() == pos {
299                    // Match starts at exactly this position — this is the rightmost match start
300                    matches.push((m.start(), m.end()));
301                    past_end = m.start();
302                    found = true;
303                    break;
304                }
305                // Match starts later than pos — skip to before that match
306                // No point checking positions between pos and m.start() since
307                // find_at already told us the leftmost match from pos starts at m.start()
308                // But we need matches that START before m.start(), so continue decrementing
309            }
310            // If None, there's no match at pos or later, but there might be one earlier
311            // (find_at only searches forward from pos)
312        }
313
314        if !found {
315            break;
316        }
317    }
318
319    matches.reverse(); // Convert from backward order to left-to-right order
320    matches
321}
322
323/// Reverse records using a regex separator.
324/// Uses regex::bytes for direct byte-level matching (no UTF-8 conversion needed).
325/// NOTE: GNU tac uses POSIX Basic Regular Expressions (BRE), so we convert to ERE first.
326/// Uses backward scanning to match GNU tac's re_search behavior.
327pub fn tac_regex_separator(
328    data: &[u8],
329    pattern: &str,
330    before: bool,
331    out: &mut impl Write,
332) -> io::Result<()> {
333    if data.is_empty() {
334        return Ok(());
335    }
336
337    let re = match regex::bytes::Regex::new(pattern) {
338        Ok(r) => r,
339        Err(e) => {
340            return Err(io::Error::new(
341                io::ErrorKind::InvalidInput,
342                format!("invalid regex '{}': {}", pattern, e),
343            ));
344        }
345    };
346
347    // Use backward scanning to match GNU tac's re_search behavior
348    let matches = find_regex_matches_backward(data, &re);
349
350    if matches.is_empty() {
351        out.write_all(data)?;
352        return Ok(());
353    }
354
355    if !before {
356        let last_end = matches.last().unwrap().1;
357        let has_trailing_sep = last_end == data.len();
358        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 4);
359
360        // GNU tac appends the last separator match to close trailing content
361        if !has_trailing_sep {
362            slices.push(IoSlice::new(&data[last_end..]));
363            let last_match = matches.last().unwrap();
364            slices.push(IoSlice::new(&data[last_match.0..last_match.1]));
365        }
366
367        let mut i = matches.len();
368        while i > 0 {
369            i -= 1;
370            let rec_start = if i == 0 { 0 } else { matches[i - 1].1 };
371            let rec_end = matches[i].1;
372            slices.push(IoSlice::new(&data[rec_start..rec_end]));
373        }
374
375        write_all_slices(out, &slices)?;
376    } else {
377        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 2);
378
379        let mut i = matches.len();
380        while i > 0 {
381            i -= 1;
382            let start = matches[i].0;
383            let end = if i + 1 < matches.len() {
384                matches[i + 1].0
385            } else {
386                data.len()
387            };
388            slices.push(IoSlice::new(&data[start..end]));
389        }
390
391        if matches[0].0 > 0 {
392            slices.push(IoSlice::new(&data[..matches[0].0]));
393        }
394
395        write_all_slices(out, &slices)?;
396    }
397
398    Ok(())
399}