Skip to main content

coreutils_rs/tac/
core.rs

1use std::io::{self, IoSlice, Write};
2
3/// Maximum number of iovecs per writev() call (Linux IOV_MAX is 1024).
4const IOV_BATCH: usize = 1024;
5
6/// Write all IoSlices to the writer, handling partial writes.
7/// For large numbers of slices, batches into IOV_BATCH-sized groups.
8fn write_all_slices(out: &mut impl Write, slices: &[IoSlice<'_>]) -> io::Result<()> {
9    // Small number of slices: use simple write_all for each
10    if slices.len() <= 4 {
11        for s in slices {
12            out.write_all(s)?;
13        }
14        return Ok(());
15    }
16
17    let mut offset = 0;
18    while offset < slices.len() {
19        let end = (offset + IOV_BATCH).min(slices.len());
20        let n = out.write_vectored(&slices[offset..end])?;
21        if n == 0 {
22            return Err(io::Error::new(
23                io::ErrorKind::WriteZero,
24                "failed to write any data",
25            ));
26        }
27        let mut remaining = n;
28        while offset < end && remaining >= slices[offset].len() {
29            remaining -= slices[offset].len();
30            offset += 1;
31        }
32        if remaining > 0 && offset < end {
33            out.write_all(&slices[offset][remaining..])?;
34            offset += 1;
35        }
36    }
37    Ok(())
38}
39
40/// Reverse records separated by a single byte.
41/// Uses a single forward SIMD scan to collect all separator positions,
42/// then builds IoSlice references in reverse order for zero-copy output.
43/// Eliminates per-record memrchr calls AND the copy to an output buffer.
44pub fn tac_bytes(data: &[u8], separator: u8, before: bool, out: &mut impl Write) -> io::Result<()> {
45    if data.is_empty() {
46        return Ok(());
47    }
48
49    // For small data, use simple backward scan with direct writes
50    if data.len() < 64 * 1024 {
51        return tac_bytes_small(data, separator, before, out);
52    }
53
54    // Pre-estimate line count to avoid Vec reallocation.
55    // Conservative estimate: ~40 bytes per line for typical text.
56    let estimated_lines = (data.len() / 40).max(64);
57
58    // Single forward SIMD scan — O(n) with memchr auto-vectorization
59    let mut positions: Vec<usize> = Vec::with_capacity(estimated_lines);
60    for pos in memchr::memchr_iter(separator, data) {
61        positions.push(pos);
62    }
63
64    if positions.is_empty() {
65        return out.write_all(data);
66    }
67
68    let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
69
70    if !before {
71        // separator-after mode: records end with separator
72        let last_pos = *positions.last().unwrap();
73
74        // Trailing content without separator — output first
75        if last_pos < data.len() - 1 {
76            slices.push(IoSlice::new(&data[last_pos + 1..]));
77        }
78
79        // Records in reverse order (each includes its trailing separator)
80        for i in (0..positions.len()).rev() {
81            let start = if i == 0 { 0 } else { positions[i - 1] + 1 };
82            slices.push(IoSlice::new(&data[start..positions[i] + 1]));
83        }
84    } else {
85        // separator-before mode: records start with separator
86        for i in (0..positions.len()).rev() {
87            let end = if i + 1 < positions.len() {
88                positions[i + 1]
89            } else {
90                data.len()
91            };
92            slices.push(IoSlice::new(&data[positions[i]..end]));
93        }
94
95        // Leading content before first separator
96        if positions[0] > 0 {
97            slices.push(IoSlice::new(&data[..positions[0]]));
98        }
99    }
100
101    write_all_slices(out, &slices)?;
102    Ok(())
103}
104
105/// Small-file path: backward scan with direct write_all calls.
106fn tac_bytes_small(
107    data: &[u8],
108    separator: u8,
109    before: bool,
110    out: &mut impl Write,
111) -> io::Result<()> {
112    if !before {
113        let mut end = data.len();
114
115        // Check for trailing content after last separator
116        if let Some(last_sep) = memchr::memrchr(separator, data) {
117            if last_sep < data.len() - 1 {
118                out.write_all(&data[last_sep + 1..])?;
119                end = last_sep + 1;
120            }
121        } else {
122            out.write_all(data)?;
123            return Ok(());
124        }
125
126        let mut cursor = end;
127        while cursor > 0 {
128            let search_end = cursor - 1;
129            let prev_sep = if search_end > 0 {
130                memchr::memrchr(separator, &data[..search_end])
131            } else {
132                None
133            };
134            let start = match prev_sep {
135                Some(pos) => pos + 1,
136                None => 0,
137            };
138            out.write_all(&data[start..cursor])?;
139            cursor = start;
140        }
141    } else {
142        let mut cursor = data.len();
143        while cursor > 0 {
144            match memchr::memrchr(separator, &data[..cursor]) {
145                Some(pos) => {
146                    out.write_all(&data[pos..cursor])?;
147                    cursor = pos;
148                }
149                None => {
150                    out.write_all(&data[..cursor])?;
151                    break;
152                }
153            }
154        }
155    }
156    Ok(())
157}
158
159/// Reverse records using a multi-byte string separator.
160/// Uses SIMD-accelerated memmem for substring search.
161pub fn tac_string_separator(
162    data: &[u8],
163    separator: &[u8],
164    before: bool,
165    out: &mut impl Write,
166) -> io::Result<()> {
167    if data.is_empty() {
168        return Ok(());
169    }
170
171    if separator.len() == 1 {
172        return tac_bytes(data, separator[0], before, out);
173    }
174
175    // Find all occurrences of the separator using SIMD-accelerated memmem
176    let estimated = (data.len() / separator.len().max(40)).max(64);
177    let mut positions: Vec<usize> = Vec::with_capacity(estimated);
178    for pos in memchr::memmem::find_iter(data, separator) {
179        positions.push(pos);
180    }
181
182    if positions.is_empty() {
183        out.write_all(data)?;
184        return Ok(());
185    }
186
187    let sep_len = separator.len();
188
189    if !before {
190        let last_end = positions.last().unwrap() + sep_len;
191        let has_trailing_sep = last_end == data.len();
192        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 4);
193
194        // Trailing content without separator — output as-is
195        if !has_trailing_sep {
196            slices.push(IoSlice::new(&data[last_end..]));
197        }
198
199        let mut i = positions.len();
200        while i > 0 {
201            i -= 1;
202            let sep_start = positions[i];
203            let rec_start = if i == 0 {
204                0
205            } else {
206                positions[i - 1] + sep_len
207            };
208            slices.push(IoSlice::new(&data[rec_start..sep_start + sep_len]));
209        }
210
211        write_all_slices(out, &slices)?;
212    } else {
213        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
214
215        let mut i = positions.len();
216        while i > 0 {
217            i -= 1;
218            let start = positions[i];
219            let end = if i + 1 < positions.len() {
220                positions[i + 1]
221            } else {
222                data.len()
223            };
224            slices.push(IoSlice::new(&data[start..end]));
225        }
226
227        if positions[0] > 0 {
228            slices.push(IoSlice::new(&data[..positions[0]]));
229        }
230
231        write_all_slices(out, &slices)?;
232    }
233
234    Ok(())
235}
236
237/// Find regex matches using backward scanning, matching GNU tac's re_search behavior.
238/// GNU tac scans backward from the end, finding the rightmost starting position first.
239/// This produces different matches than forward scanning for patterns like [0-9]+.
240/// The matches are returned in left-to-right order.
241fn find_regex_matches_backward(data: &[u8], re: &regex::bytes::Regex) -> Vec<(usize, usize)> {
242    let mut matches = Vec::new();
243    let mut past_end = data.len();
244
245    while past_end > 0 {
246        let buf = &data[..past_end];
247        let mut found = false;
248
249        // Scan backward: try positions from past_end-1 down to 0
250        // We need the LAST match starting position in buf, so we try from the end
251        let mut pos = past_end;
252        while pos > 0 {
253            pos -= 1;
254            if let Some(m) = re.find_at(buf, pos) {
255                if m.start() == pos {
256                    // Match starts at exactly this position — this is the rightmost match start
257                    matches.push((m.start(), m.end()));
258                    past_end = m.start();
259                    found = true;
260                    break;
261                }
262                // Match starts later than pos — skip to before that match
263                // No point checking positions between pos and m.start() since
264                // find_at already told us the leftmost match from pos starts at m.start()
265                // But we need matches that START before m.start(), so continue decrementing
266            }
267            // If None, there's no match at pos or later, but there might be one earlier
268            // (find_at only searches forward from pos)
269        }
270
271        if !found {
272            break;
273        }
274    }
275
276    matches.reverse(); // Convert from backward order to left-to-right order
277    matches
278}
279
280/// Reverse records using a regex separator.
281/// Uses regex::bytes for direct byte-level matching (no UTF-8 conversion needed).
282/// NOTE: GNU tac uses POSIX Basic Regular Expressions (BRE), so we convert to ERE first.
283/// Uses backward scanning to match GNU tac's re_search behavior.
284pub fn tac_regex_separator(
285    data: &[u8],
286    pattern: &str,
287    before: bool,
288    out: &mut impl Write,
289) -> io::Result<()> {
290    if data.is_empty() {
291        return Ok(());
292    }
293
294    let re = match regex::bytes::Regex::new(pattern) {
295        Ok(r) => r,
296        Err(e) => {
297            return Err(io::Error::new(
298                io::ErrorKind::InvalidInput,
299                format!("invalid regex '{}': {}", pattern, e),
300            ));
301        }
302    };
303
304    // Use backward scanning to match GNU tac's re_search behavior
305    let matches = find_regex_matches_backward(data, &re);
306
307    if matches.is_empty() {
308        out.write_all(data)?;
309        return Ok(());
310    }
311
312    if !before {
313        let last_end = matches.last().unwrap().1;
314        let has_trailing_sep = last_end == data.len();
315        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 4);
316
317        // Trailing content without separator — output as-is
318        if !has_trailing_sep {
319            slices.push(IoSlice::new(&data[last_end..]));
320        }
321
322        let mut i = matches.len();
323        while i > 0 {
324            i -= 1;
325            let rec_start = if i == 0 { 0 } else { matches[i - 1].1 };
326            let rec_end = matches[i].1;
327            slices.push(IoSlice::new(&data[rec_start..rec_end]));
328        }
329
330        write_all_slices(out, &slices)?;
331    } else {
332        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 2);
333
334        let mut i = matches.len();
335        while i > 0 {
336            i -= 1;
337            let start = matches[i].0;
338            let end = if i + 1 < matches.len() {
339                matches[i + 1].0
340            } else {
341                data.len()
342            };
343            slices.push(IoSlice::new(&data[start..end]));
344        }
345
346        if matches[0].0 > 0 {
347            slices.push(IoSlice::new(&data[..matches[0].0]));
348        }
349
350        write_all_slices(out, &slices)?;
351    }
352
353    Ok(())
354}