Skip to main content

coreutils_rs/tac/
core.rs

1use std::io::{self, IoSlice, Write};
2
3/// Output buffer size for backward-fill strategy.
4const OUT_BUF: usize = 8 * 1024 * 1024;
5
6/// Maximum number of iovecs per writev() call (Linux IOV_MAX is 1024).
7const IOV_BATCH: usize = 1024;
8
9/// Write all IoSlices to the writer, handling partial writes.
10fn write_all_slices(out: &mut impl Write, slices: &[IoSlice<'_>]) -> io::Result<()> {
11    let mut offset = 0;
12    while offset < slices.len() {
13        let end = (offset + IOV_BATCH).min(slices.len());
14        let n = out.write_vectored(&slices[offset..end])?;
15        if n == 0 {
16            return Err(io::Error::new(
17                io::ErrorKind::WriteZero,
18                "failed to write any data",
19            ));
20        }
21        let mut remaining = n;
22        while offset < end && remaining >= slices[offset].len() {
23            remaining -= slices[offset].len();
24            offset += 1;
25        }
26        if remaining > 0 && offset < end {
27            out.write_all(&slices[offset][remaining..])?;
28            offset += 1;
29        }
30    }
31    Ok(())
32}
33
34/// Reverse records separated by a single byte using backward memrchr scanning.
35/// Scans backward from end, writes each record as found. No Vec<usize> allocation.
36/// Uses a backward-fill buffer: records are copied backward into an 8MB buffer,
37/// then flushed as a single write_all when full.
38pub fn tac_bytes(data: &[u8], separator: u8, before: bool, out: &mut impl Write) -> io::Result<()> {
39    if data.is_empty() {
40        return Ok(());
41    }
42
43    // For small data, use simple backward scan with direct writes
44    if data.len() < 64 * 1024 {
45        return tac_bytes_small(data, separator, before, out);
46    }
47
48    // Large data: backward scan with forward-fill output buffer
49    let mut buf = vec![0u8; OUT_BUF];
50    let mut bp = 0; // write position (fills forward)
51
52    if !before {
53        // separator-after mode: records end with separator
54        let mut end = data.len();
55
56        // Check for trailing content after last separator
57        if let Some(last_sep) = memchr::memrchr(separator, data) {
58            if last_sep < data.len() - 1 {
59                // Trailing content without separator — output first
60                let record = &data[last_sep + 1..];
61                if bp + record.len() > OUT_BUF {
62                    out.write_all(&buf[..bp])?;
63                    bp = 0;
64                }
65                if record.len() > OUT_BUF {
66                    out.write_all(record)?;
67                } else {
68                    buf[bp..bp + record.len()].copy_from_slice(record);
69                    bp += record.len();
70                }
71                end = last_sep + 1;
72            }
73        } else {
74            // No separator found — output entire data
75            out.write_all(data)?;
76            return Ok(());
77        }
78
79        // Scan backward through records
80        let mut cursor = end;
81        while cursor > 0 {
82            // Find previous separator
83            let search_end = cursor - 1; // skip separator at cursor-1
84            let prev_sep = if search_end > 0 {
85                memchr::memrchr(separator, &data[..search_end])
86            } else {
87                None
88            };
89
90            let start = match prev_sep {
91                Some(pos) => pos + 1,
92                None => 0,
93            };
94
95            let record = &data[start..cursor];
96            if bp + record.len() > OUT_BUF {
97                out.write_all(&buf[..bp])?;
98                bp = 0;
99            }
100            if record.len() > OUT_BUF {
101                out.write_all(record)?;
102            } else {
103                buf[bp..bp + record.len()].copy_from_slice(record);
104                bp += record.len();
105            }
106
107            cursor = start;
108        }
109    } else {
110        // separator-before mode: records start with separator
111        let mut cursor = data.len();
112        while cursor > 0 {
113            let sep_pos = memchr::memrchr(separator, &data[..cursor]);
114            match sep_pos {
115                Some(pos) => {
116                    let record = &data[pos..cursor];
117                    if bp + record.len() > OUT_BUF {
118                        out.write_all(&buf[..bp])?;
119                        bp = 0;
120                    }
121                    if record.len() > OUT_BUF {
122                        out.write_all(record)?;
123                    } else {
124                        buf[bp..bp + record.len()].copy_from_slice(record);
125                        bp += record.len();
126                    }
127                    cursor = pos;
128                }
129                None => {
130                    // Leading content before first separator
131                    let record = &data[..cursor];
132                    if bp + record.len() > OUT_BUF {
133                        out.write_all(&buf[..bp])?;
134                        bp = 0;
135                    }
136                    if record.len() > OUT_BUF {
137                        out.write_all(record)?;
138                    } else {
139                        buf[bp..bp + record.len()].copy_from_slice(record);
140                        bp += record.len();
141                    }
142                    break;
143                }
144            }
145        }
146    }
147
148    // Flush remaining buffer
149    if bp > 0 {
150        out.write_all(&buf[..bp])?;
151    }
152
153    Ok(())
154}
155
156/// Small-file path: backward scan with direct write_all calls.
157fn tac_bytes_small(
158    data: &[u8],
159    separator: u8,
160    before: bool,
161    out: &mut impl Write,
162) -> io::Result<()> {
163    if !before {
164        let mut end = data.len();
165
166        // Check for trailing content after last separator
167        if let Some(last_sep) = memchr::memrchr(separator, data) {
168            if last_sep < data.len() - 1 {
169                out.write_all(&data[last_sep + 1..])?;
170                end = last_sep + 1;
171            }
172        } else {
173            out.write_all(data)?;
174            return Ok(());
175        }
176
177        let mut cursor = end;
178        while cursor > 0 {
179            let search_end = cursor - 1;
180            let prev_sep = if search_end > 0 {
181                memchr::memrchr(separator, &data[..search_end])
182            } else {
183                None
184            };
185            let start = match prev_sep {
186                Some(pos) => pos + 1,
187                None => 0,
188            };
189            out.write_all(&data[start..cursor])?;
190            cursor = start;
191        }
192    } else {
193        let mut cursor = data.len();
194        while cursor > 0 {
195            match memchr::memrchr(separator, &data[..cursor]) {
196                Some(pos) => {
197                    out.write_all(&data[pos..cursor])?;
198                    cursor = pos;
199                }
200                None => {
201                    out.write_all(&data[..cursor])?;
202                    break;
203                }
204            }
205        }
206    }
207    Ok(())
208}
209
210/// Reverse records using a multi-byte string separator.
211/// Uses SIMD-accelerated memmem for substring search.
212pub fn tac_string_separator(
213    data: &[u8],
214    separator: &[u8],
215    before: bool,
216    out: &mut impl Write,
217) -> io::Result<()> {
218    if data.is_empty() {
219        return Ok(());
220    }
221
222    if separator.len() == 1 {
223        return tac_bytes(data, separator[0], before, out);
224    }
225
226    // Find all occurrences of the separator using SIMD-accelerated memmem
227    let positions: Vec<usize> = memchr::memmem::find_iter(data, separator).collect();
228
229    if positions.is_empty() {
230        out.write_all(data)?;
231        return Ok(());
232    }
233
234    let sep_len = separator.len();
235
236    if !before {
237        let last_end = positions.last().unwrap() + sep_len;
238        let has_trailing_sep = last_end == data.len();
239        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 4);
240
241        // Trailing content without separator — output as-is
242        if !has_trailing_sep {
243            slices.push(IoSlice::new(&data[last_end..]));
244        }
245
246        let mut i = positions.len();
247        while i > 0 {
248            i -= 1;
249            let sep_start = positions[i];
250            let rec_start = if i == 0 {
251                0
252            } else {
253                positions[i - 1] + sep_len
254            };
255            slices.push(IoSlice::new(&data[rec_start..sep_start + sep_len]));
256        }
257
258        write_all_slices(out, &slices)?;
259    } else {
260        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(positions.len() + 2);
261
262        let mut i = positions.len();
263        while i > 0 {
264            i -= 1;
265            let start = positions[i];
266            let end = if i + 1 < positions.len() {
267                positions[i + 1]
268            } else {
269                data.len()
270            };
271            slices.push(IoSlice::new(&data[start..end]));
272        }
273
274        if positions[0] > 0 {
275            slices.push(IoSlice::new(&data[..positions[0]]));
276        }
277
278        write_all_slices(out, &slices)?;
279    }
280
281    Ok(())
282}
283
284/// Find regex matches using backward scanning, matching GNU tac's re_search behavior.
285/// GNU tac scans backward from the end, finding the rightmost starting position first.
286/// This produces different matches than forward scanning for patterns like [0-9]+.
287/// The matches are returned in left-to-right order.
288fn find_regex_matches_backward(data: &[u8], re: &regex::bytes::Regex) -> Vec<(usize, usize)> {
289    let mut matches = Vec::new();
290    let mut past_end = data.len();
291
292    while past_end > 0 {
293        let buf = &data[..past_end];
294        let mut found = false;
295
296        // Scan backward: try positions from past_end-1 down to 0
297        // We need the LAST match starting position in buf, so we try from the end
298        let mut pos = past_end;
299        while pos > 0 {
300            pos -= 1;
301            if let Some(m) = re.find_at(buf, pos) {
302                if m.start() == pos {
303                    // Match starts at exactly this position — this is the rightmost match start
304                    matches.push((m.start(), m.end()));
305                    past_end = m.start();
306                    found = true;
307                    break;
308                }
309                // Match starts later than pos — skip to before that match
310                // No point checking positions between pos and m.start() since
311                // find_at already told us the leftmost match from pos starts at m.start()
312                // But we need matches that START before m.start(), so continue decrementing
313            }
314            // If None, there's no match at pos or later, but there might be one earlier
315            // (find_at only searches forward from pos)
316        }
317
318        if !found {
319            break;
320        }
321    }
322
323    matches.reverse(); // Convert from backward order to left-to-right order
324    matches
325}
326
327/// Reverse records using a regex separator.
328/// Uses regex::bytes for direct byte-level matching (no UTF-8 conversion needed).
329/// NOTE: GNU tac uses POSIX Basic Regular Expressions (BRE), so we convert to ERE first.
330/// Uses backward scanning to match GNU tac's re_search behavior.
331pub fn tac_regex_separator(
332    data: &[u8],
333    pattern: &str,
334    before: bool,
335    out: &mut impl Write,
336) -> io::Result<()> {
337    if data.is_empty() {
338        return Ok(());
339    }
340
341    let re = match regex::bytes::Regex::new(pattern) {
342        Ok(r) => r,
343        Err(e) => {
344            return Err(io::Error::new(
345                io::ErrorKind::InvalidInput,
346                format!("invalid regex '{}': {}", pattern, e),
347            ));
348        }
349    };
350
351    // Use backward scanning to match GNU tac's re_search behavior
352    let matches = find_regex_matches_backward(data, &re);
353
354    if matches.is_empty() {
355        out.write_all(data)?;
356        return Ok(());
357    }
358
359    if !before {
360        let last_end = matches.last().unwrap().1;
361        let has_trailing_sep = last_end == data.len();
362        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 4);
363
364        // Trailing content without separator — output as-is
365        if !has_trailing_sep {
366            slices.push(IoSlice::new(&data[last_end..]));
367        }
368
369        let mut i = matches.len();
370        while i > 0 {
371            i -= 1;
372            let rec_start = if i == 0 { 0 } else { matches[i - 1].1 };
373            let rec_end = matches[i].1;
374            slices.push(IoSlice::new(&data[rec_start..rec_end]));
375        }
376
377        write_all_slices(out, &slices)?;
378    } else {
379        let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(matches.len() + 2);
380
381        let mut i = matches.len();
382        while i > 0 {
383            i -= 1;
384            let start = matches[i].0;
385            let end = if i + 1 < matches.len() {
386                matches[i + 1].0
387            } else {
388                data.len()
389            };
390            slices.push(IoSlice::new(&data[start..end]));
391        }
392
393        if matches[0].0 > 0 {
394            slices.push(IoSlice::new(&data[..matches[0].0]));
395        }
396
397        write_all_slices(out, &slices)?;
398    }
399
400    Ok(())
401}