Skip to main content

coreutils_rs/tac/
core.rs

1use std::io::{self, IoSlice, Write};
2
3/// Maximum number of iovecs per writev() call (Linux IOV_MAX is 1024).
4const IOV_BATCH: usize = 1024;
5
6/// Write all IoSlices to the writer, handling partial writes.
7/// For large numbers of slices, batches into IOV_BATCH-sized groups.
8fn write_all_slices(out: &mut impl Write, slices: &[IoSlice<'_>]) -> io::Result<()> {
9    // Small number of slices: use simple write_all for each
10    if slices.len() <= 4 {
11        for s in slices {
12            out.write_all(s)?;
13        }
14        return Ok(());
15    }
16
17    let mut offset = 0;
18    while offset < slices.len() {
19        let end = (offset + IOV_BATCH).min(slices.len());
20        let n = out.write_vectored(&slices[offset..end])?;
21        if n == 0 {
22            return Err(io::Error::new(
23                io::ErrorKind::WriteZero,
24                "failed to write any data",
25            ));
26        }
27        let mut remaining = n;
28        while offset < end && remaining >= slices[offset].len() {
29            remaining -= slices[offset].len();
30            offset += 1;
31        }
32        if remaining > 0 && offset < end {
33            out.write_all(&slices[offset][remaining..])?;
34            offset += 1;
35        }
36    }
37    Ok(())
38}
39
40/// Reverse records separated by a single byte.
41/// Uses backward SIMD scan (memrchr_iter) to process records from back to front,
42/// building batched IoSlice references for zero-copy writev output.
43/// O(IOV_BATCH) memory — no positions Vec allocation needed at all.
44/// For 100MB/2.5M lines: saves ~20MB positions Vec + ~40MB IoSlice Vec.
45pub fn tac_bytes(data: &[u8], separator: u8, before: bool, out: &mut impl Write) -> io::Result<()> {
46    if data.is_empty() {
47        return Ok(());
48    }
49
50    // For small/medium data (< 16MB), use simple backward scan with direct writes.
51    // A single write_all is faster than writev with many small segments.
52    if data.len() < 16 * 1024 * 1024 {
53        return tac_bytes_small(data, separator, before, out);
54    }
55
56    let mut batch: Vec<IoSlice<'_>> = Vec::with_capacity(IOV_BATCH);
57
58    if !before {
59        // separator-after mode: records end with separator
60        let mut iter = memchr::memrchr_iter(separator, data);
61
62        let first_sep = match iter.next() {
63            Some(pos) => pos,
64            None => return out.write_all(data), // no separator found
65        };
66
67        // Trailing content without separator — output first
68        if first_sep + 1 < data.len() {
69            batch.push(IoSlice::new(&data[first_sep + 1..]));
70        }
71
72        let mut end = first_sep + 1; // exclusive end of current record
73
74        // Process remaining separators from back to front
75        for pos in iter {
76            batch.push(IoSlice::new(&data[pos + 1..end]));
77            end = pos + 1;
78            if batch.len() == IOV_BATCH {
79                write_all_slices(out, &batch)?;
80                batch.clear();
81            }
82        }
83
84        // First record (no separator before it)
85        if end > 0 {
86            batch.push(IoSlice::new(&data[0..end]));
87        }
88    } else {
89        // separator-before mode: records start with separator
90        let mut end = data.len();
91
92        for pos in memchr::memrchr_iter(separator, data) {
93            batch.push(IoSlice::new(&data[pos..end]));
94            end = pos;
95            if batch.len() == IOV_BATCH {
96                write_all_slices(out, &batch)?;
97                batch.clear();
98            }
99        }
100
101        // Leading content before first separator
102        if end > 0 {
103            batch.push(IoSlice::new(&data[0..end]));
104        }
105    }
106
107    if !batch.is_empty() {
108        write_all_slices(out, &batch)?;
109    }
110    Ok(())
111}
112
113/// Small-file path: backward SIMD scan, copy records into contiguous buffer, single write.
114/// Uses memrchr_iter for backward scanning (matches large-path approach).
115/// Avoids positions Vec + IoSlice Vec allocations and writev syscalls.
116/// Single write_all is faster than writev with many small segments for small data.
117fn tac_bytes_small(
118    data: &[u8],
119    separator: u8,
120    before: bool,
121    out: &mut impl Write,
122) -> io::Result<()> {
123    // Use extend_from_slice instead of zero-init + index copy.
124    // Avoids zero-initializing the entire buffer (saves memset overhead).
125    let mut outbuf = Vec::with_capacity(data.len());
126
127    if !before {
128        // separator-after mode: records end with separator
129        let mut iter = memchr::memrchr_iter(separator, data);
130
131        let first_sep = match iter.next() {
132            Some(pos) => pos,
133            None => return out.write_all(data),
134        };
135
136        // Trailing content without separator — output first
137        if first_sep + 1 < data.len() {
138            outbuf.extend_from_slice(&data[first_sep + 1..]);
139        }
140
141        let mut end = first_sep + 1;
142
143        for pos in iter {
144            outbuf.extend_from_slice(&data[pos + 1..end]);
145            end = pos + 1;
146        }
147
148        // First record
149        if end > 0 {
150            outbuf.extend_from_slice(&data[0..end]);
151        }
152    } else {
153        // separator-before mode: records start with separator
154        let mut end = data.len();
155
156        for pos in memchr::memrchr_iter(separator, data) {
157            outbuf.extend_from_slice(&data[pos..end]);
158            end = pos;
159        }
160
161        // Leading content before first separator
162        if end > 0 {
163            outbuf.extend_from_slice(&data[0..end]);
164        }
165    }
166
167    out.write_all(&outbuf)
168}
169
170/// Reverse records using a multi-byte string separator.
171/// Uses SIMD-accelerated memmem for substring search.
172pub fn tac_string_separator(
173    data: &[u8],
174    separator: &[u8],
175    before: bool,
176    out: &mut impl Write,
177) -> io::Result<()> {
178    if data.is_empty() {
179        return Ok(());
180    }
181
182    if separator.len() == 1 {
183        return tac_bytes(data, separator[0], before, out);
184    }
185
186    // Find all occurrences of the separator using SIMD-accelerated memmem
187    let estimated = (data.len() / separator.len().max(40)).max(64);
188    let mut positions: Vec<usize> = Vec::with_capacity(estimated);
189    for pos in memchr::memmem::find_iter(data, separator) {
190        positions.push(pos);
191    }
192
193    if positions.is_empty() {
194        out.write_all(data)?;
195        return Ok(());
196    }
197
198    let sep_len = separator.len();
199
200    // Small data: contiguous buffer + single write (avoids IoSlice/writev overhead)
201    if data.len() < 16 * 1024 * 1024 {
202        let mut outbuf = Vec::with_capacity(data.len());
203
204        if !before {
205            let last_end = positions.last().unwrap() + sep_len;
206
207            if last_end < data.len() {
208                outbuf.extend_from_slice(&data[last_end..]);
209            }
210
211            let mut i = positions.len();
212            while i > 0 {
213                i -= 1;
214                let sep_start = positions[i];
215                let rec_start = if i == 0 {
216                    0
217                } else {
218                    positions[i - 1] + sep_len
219                };
220                outbuf.extend_from_slice(&data[rec_start..sep_start + sep_len]);
221            }
222        } else {
223            let mut i = positions.len();
224            while i > 0 {
225                i -= 1;
226                let start = positions[i];
227                let end = if i + 1 < positions.len() {
228                    positions[i + 1]
229                } else {
230                    data.len()
231                };
232                outbuf.extend_from_slice(&data[start..end]);
233            }
234            if positions[0] > 0 {
235                outbuf.extend_from_slice(&data[..positions[0]]);
236            }
237        }
238        return out.write_all(&outbuf);
239    }
240
241    // Large data: batched IoSlice/writev for zero-copy output
242    let mut batch: Vec<IoSlice<'_>> = Vec::with_capacity(IOV_BATCH);
243
244    if !before {
245        let last_end = positions.last().unwrap() + sep_len;
246        let has_trailing_sep = last_end == data.len();
247
248        if !has_trailing_sep {
249            batch.push(IoSlice::new(&data[last_end..]));
250        }
251
252        let mut i = positions.len();
253        while i > 0 {
254            i -= 1;
255            let sep_start = positions[i];
256            let rec_start = if i == 0 {
257                0
258            } else {
259                positions[i - 1] + sep_len
260            };
261            batch.push(IoSlice::new(&data[rec_start..sep_start + sep_len]));
262            if batch.len() == IOV_BATCH {
263                write_all_slices(out, &batch)?;
264                batch.clear();
265            }
266        }
267    } else {
268        let mut i = positions.len();
269        while i > 0 {
270            i -= 1;
271            let start = positions[i];
272            let end = if i + 1 < positions.len() {
273                positions[i + 1]
274            } else {
275                data.len()
276            };
277            batch.push(IoSlice::new(&data[start..end]));
278            if batch.len() == IOV_BATCH {
279                write_all_slices(out, &batch)?;
280                batch.clear();
281            }
282        }
283
284        if positions[0] > 0 {
285            batch.push(IoSlice::new(&data[..positions[0]]));
286        }
287    }
288
289    if !batch.is_empty() {
290        write_all_slices(out, &batch)?;
291    }
292
293    Ok(())
294}
295
296/// Find regex matches using backward scanning, matching GNU tac's re_search behavior.
297/// GNU tac scans backward from the end, finding the rightmost starting position first.
298/// This produces different matches than forward scanning for patterns like [0-9]+.
299/// The matches are returned in left-to-right order.
300fn find_regex_matches_backward(data: &[u8], re: &regex::bytes::Regex) -> Vec<(usize, usize)> {
301    let mut matches = Vec::new();
302    let mut past_end = data.len();
303
304    while past_end > 0 {
305        let buf = &data[..past_end];
306        let mut found = false;
307
308        // Scan backward: try positions from past_end-1 down to 0
309        // We need the LAST match starting position in buf, so we try from the end
310        let mut pos = past_end;
311        while pos > 0 {
312            pos -= 1;
313            if let Some(m) = re.find_at(buf, pos) {
314                if m.start() == pos {
315                    // Match starts at exactly this position — this is the rightmost match start
316                    matches.push((m.start(), m.end()));
317                    past_end = m.start();
318                    found = true;
319                    break;
320                }
321                // Match starts later than pos — skip to before that match
322                // No point checking positions between pos and m.start() since
323                // find_at already told us the leftmost match from pos starts at m.start()
324                // But we need matches that START before m.start(), so continue decrementing
325            }
326            // If None, there's no match at pos or later, but there might be one earlier
327            // (find_at only searches forward from pos)
328        }
329
330        if !found {
331            break;
332        }
333    }
334
335    matches.reverse(); // Convert from backward order to left-to-right order
336    matches
337}
338
339/// Reverse records using a regex separator.
340/// Uses regex::bytes for direct byte-level matching (no UTF-8 conversion needed).
341/// NOTE: GNU tac uses POSIX Basic Regular Expressions (BRE), so we convert to ERE first.
342/// Uses backward scanning to match GNU tac's re_search behavior.
343pub fn tac_regex_separator(
344    data: &[u8],
345    pattern: &str,
346    before: bool,
347    out: &mut impl Write,
348) -> io::Result<()> {
349    if data.is_empty() {
350        return Ok(());
351    }
352
353    let re = match regex::bytes::Regex::new(pattern) {
354        Ok(r) => r,
355        Err(e) => {
356            return Err(io::Error::new(
357                io::ErrorKind::InvalidInput,
358                format!("invalid regex '{}': {}", pattern, e),
359            ));
360        }
361    };
362
363    // Use backward scanning to match GNU tac's re_search behavior
364    let matches = find_regex_matches_backward(data, &re);
365
366    if matches.is_empty() {
367        out.write_all(data)?;
368        return Ok(());
369    }
370
371    // Small data: contiguous buffer + single write (avoids IoSlice/writev overhead)
372    if data.len() < 16 * 1024 * 1024 {
373        let mut outbuf = Vec::with_capacity(data.len());
374
375        if !before {
376            let last_end = matches.last().unwrap().1;
377
378            if last_end < data.len() {
379                outbuf.extend_from_slice(&data[last_end..]);
380            }
381
382            let mut i = matches.len();
383            while i > 0 {
384                i -= 1;
385                let rec_start = if i == 0 { 0 } else { matches[i - 1].1 };
386                outbuf.extend_from_slice(&data[rec_start..matches[i].1]);
387            }
388        } else {
389            let mut i = matches.len();
390            while i > 0 {
391                i -= 1;
392                let start = matches[i].0;
393                let end = if i + 1 < matches.len() {
394                    matches[i + 1].0
395                } else {
396                    data.len()
397                };
398                outbuf.extend_from_slice(&data[start..end]);
399            }
400            if matches[0].0 > 0 {
401                outbuf.extend_from_slice(&data[..matches[0].0]);
402            }
403        }
404        return out.write_all(&outbuf);
405    }
406
407    // Large data: batched IoSlice/writev for zero-copy output
408    let mut batch: Vec<IoSlice<'_>> = Vec::with_capacity(IOV_BATCH);
409
410    if !before {
411        let last_end = matches.last().unwrap().1;
412        let has_trailing_sep = last_end == data.len();
413
414        if !has_trailing_sep {
415            batch.push(IoSlice::new(&data[last_end..]));
416        }
417
418        let mut i = matches.len();
419        while i > 0 {
420            i -= 1;
421            let rec_start = if i == 0 { 0 } else { matches[i - 1].1 };
422            let rec_end = matches[i].1;
423            batch.push(IoSlice::new(&data[rec_start..rec_end]));
424            if batch.len() == IOV_BATCH {
425                write_all_slices(out, &batch)?;
426                batch.clear();
427            }
428        }
429    } else {
430        let mut i = matches.len();
431        while i > 0 {
432            i -= 1;
433            let start = matches[i].0;
434            let end = if i + 1 < matches.len() {
435                matches[i + 1].0
436            } else {
437                data.len()
438            };
439            batch.push(IoSlice::new(&data[start..end]));
440            if batch.len() == IOV_BATCH {
441                write_all_slices(out, &batch)?;
442                batch.clear();
443            }
444        }
445
446        if matches[0].0 > 0 {
447            batch.push(IoSlice::new(&data[..matches[0].0]));
448        }
449    }
450
451    if !batch.is_empty() {
452        write_all_slices(out, &batch)?;
453    }
454
455    Ok(())
456}