// coreutils_rs/tac/core.rs

use std::io::{self, IoSlice, Write};

/// Upper bound on IoSlice entries handed to a single write_vectored call.
/// Linux caps vectored I/O at UIO_MAXIOV (1024) entries per call.
const MAX_IOV: usize = 1024;

/// Chunk size (4 MB) for the forward-scan-within-backward-chunks strategy.
/// Larger chunks mean fewer memchr_iter invocations, each covering a bigger
/// contiguous region, which keeps the SIMD search pipeline well fed.
const CHUNK: usize = 4 * 1024 * 1024;

/// Write a sequence of IoSlice entries to `out`.
///
/// Slices are submitted in batches of at most MAX_IOV so each write_vectored
/// call respects the kernel's UIO_MAXIOV limit. If the kernel performs a
/// short write, the remainder of that batch is pushed out one slice at a
/// time via write_all.
#[inline]
fn flush_iov(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
    for batch in slices.chunks(MAX_IOV) {
        let batch_len: usize = batch.iter().map(|s| s.len()).sum();
        let written = out.write_vectored(batch)?;
        if written >= batch_len {
            continue; // fast path: the whole batch went out in one call
        }
        // Short write: skip what already landed, write_all the rest.
        let mut to_skip = written;
        for slice in batch {
            if to_skip >= slice.len() {
                to_skip -= slice.len();
            } else {
                out.write_all(&slice[to_skip..])?;
                to_skip = 0;
            }
        }
    }
    Ok(())
}
42
43/// Reverse records of an owned Vec in-place, then write.
44/// Avoids allocating a second output buffer by using a two-pass approach:
45/// 1. Reverse all bytes in the Vec
46/// 2. Reverse each individual record (between separators)
47/// This produces the same output as copying records in reverse order.
48///
49/// Only works for after-separator mode with single-byte separator.
50pub fn tac_bytes_owned(
51    data: &mut [u8],
52    separator: u8,
53    before: bool,
54    out: &mut impl Write,
55) -> io::Result<()> {
56    if data.is_empty() {
57        return Ok(());
58    }
59    // For before-separator mode or complex cases, fall back to the copy approach
60    if before {
61        return tac_bytes(data, separator, before, out);
62    }
63
64    let len = data.len();
65
66    // Step 1: Reverse the entire buffer
67    data.reverse();
68
69    // Step 2: Reverse each record within the reversed buffer.
70    // After step 1, records are in the right order but each record's bytes are reversed.
71    // We need to reverse each individual record.
72    //
73    // After global reverse, what were "\n" separators at the END of records
74    // are now at the START. So records are [sep][reversed_bytes]...[sep][reversed_bytes][maybe_no_sep]
75    //
76    // Collect separator positions first (memchr SIMD pass), then reverse each segment.
77    let positions: Vec<usize> = memchr::memchr_iter(separator, data).collect();
78    let mut start = 0;
79    for &pos in &positions {
80        if pos > start {
81            data[start..pos].reverse();
82        }
83        start = pos + 1;
84    }
85    // Reverse the last segment (no trailing separator)
86    if start < len {
87        data[start..len].reverse();
88    }
89
90    out.write_all(data)
91}
92
93/// Reverse records separated by a single byte.
94/// Uses single forward SIMD pass to find separators, then builds zero-copy
95/// reversed output via IoSlice entries pointing directly at the source data.
96///
97/// For data up to ZEROCOPY_LIMIT (256MB), builds IoSlice entries for all
98/// records and writes via batched write_vectored. For larger data, falls
99/// back to chunked backward scan to limit memory overhead.
100pub fn tac_bytes(data: &[u8], separator: u8, before: bool, out: &mut impl Write) -> io::Result<()> {
101    if data.is_empty() {
102        return Ok(());
103    }
104    if data.len() <= ZEROCOPY_LIMIT {
105        if !before {
106            tac_bytes_zerocopy_after(data, separator, out)
107        } else {
108            tac_bytes_zerocopy_before(data, separator, out)
109        }
110    } else if !before {
111        tac_bytes_backward_after(data, separator, out)
112    } else {
113        tac_bytes_backward_before(data, separator, out)
114    }
115}
116
/// Threshold for the zero-copy IoSlice strategy (256 MB).
/// Below this, the separator-position Vec plus the IoSlice Vec (together a
/// few tens of MB for typical short-line input) are an acceptable overhead.
/// Larger inputs use the chunked backward scan to cap that bookkeeping.
const ZEROCOPY_LIMIT: usize = 256 * 1024 * 1024;
121
122/// Zero-copy after-separator mode: builds IoSlice entries pointing directly
123/// at the source data in reverse record order. Uses forward SIMD memchr_iter
124/// to find all separator positions, then constructs IoSlice entries in reverse
125/// order and writes them via batched write_vectored.
126///
127/// Eliminates the 10MB output buffer allocation and data copy that the previous
128/// contiguous-buffer approach required. For /dev/null or pipe output, this is
129/// significantly faster since we avoid touching 10MB of output memory entirely.
130fn tac_bytes_zerocopy_after(data: &[u8], sep: u8, out: &mut impl Write) -> io::Result<()> {
131    // Collect separator positions with forward SIMD memchr (single fast pass).
132    // For 10MB with 50-byte lines, this produces ~200K positions.
133    let positions: Vec<usize> = memchr::memchr_iter(sep, data).collect();
134
135    // Build IoSlice entries in reverse order, pointing directly at source data.
136    // Each record is the slice between separators (after the separator byte).
137    let num_records = positions.len() + 1; // +1 for potential leading record
138    let mut iov: Vec<IoSlice> = Vec::with_capacity(num_records);
139
140    let mut end = data.len();
141    for &pos in positions.iter().rev() {
142        let rec_start = pos + 1;
143        if rec_start < end {
144            iov.push(IoSlice::new(&data[rec_start..end]));
145        }
146        end = rec_start;
147    }
148    // First record (before the first separator)
149    if end > 0 {
150        iov.push(IoSlice::new(&data[..end]));
151    }
152
153    flush_iov(out, &iov)
154}
155
156/// Zero-copy before-separator mode: builds IoSlice entries pointing directly
157/// at the source data in reverse record order.
158fn tac_bytes_zerocopy_before(data: &[u8], sep: u8, out: &mut impl Write) -> io::Result<()> {
159    let positions: Vec<usize> = memchr::memchr_iter(sep, data).collect();
160
161    let num_records = positions.len() + 1;
162    let mut iov: Vec<IoSlice> = Vec::with_capacity(num_records);
163
164    let mut end = data.len();
165    for &pos in positions.iter().rev() {
166        if pos < end {
167            iov.push(IoSlice::new(&data[pos..end]));
168        }
169        end = pos;
170    }
171    // First record (before the first separator)
172    if end > 0 {
173        iov.push(IoSlice::new(&data[..end]));
174    }
175
176    flush_iov(out, &iov)
177}
178
/// After-separator mode: chunk-based forward SIMD scan, emitted in reverse.
///
/// Rather than one memrchr call per record (high per-call overhead for short
/// records), the buffer is walked backward in CHUNK-sized pieces. Within
/// each piece a single forward memchr_iter pass finds every separator with
/// full SIMD pipeline utilisation, then the records (borrowed slices) are
/// emitted in reverse order.
///
/// NOTE(review): each iteration scans up to `global_end`, so when a chunk
/// contains no separator the next iteration re-scans part of the same
/// region. Output stays correct — positions at the already-emitted boundary
/// are skipped by the `pos + 1 < global_end` guard — but a file whose
/// records are much longer than CHUNK pays repeated scanning.
fn tac_bytes_backward_after(data: &[u8], sep: u8, out: &mut impl Write) -> io::Result<()> {
    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);

    // `global_end` tracks where the current (rightmost unseen) record ends.
    let mut global_end = data.len();

    // Separator-position scratch buffer, heap-allocated once and reused for
    // every chunk. Capacity CHUNK covers the worst case where every byte of
    // a chunk is a separator.
    let mut positions_buf: Vec<usize> = Vec::with_capacity(CHUNK);

    let mut chunk_start = data.len().saturating_sub(CHUNK);

    loop {
        // Never scan past global_end: everything right of it is emitted.
        let chunk_end = global_end.min(data.len());
        if chunk_start >= chunk_end {
            if chunk_start == 0 {
                break;
            }
            chunk_start = chunk_start.saturating_sub(CHUNK);
            continue;
        }
        let chunk = &data[chunk_start..chunk_end];

        // Reuse positions buffer: clear and refill without reallocation.
        // Offsets are rebased from chunk-relative to whole-buffer positions.
        positions_buf.clear();
        positions_buf.extend(memchr::memchr_iter(sep, chunk).map(|p| p + chunk_start));

        // Emit records in reverse (rightmost first); each record ends just
        // after its separator byte.
        for &pos in positions_buf.iter().rev() {
            if pos + 1 < global_end {
                iov.push(IoSlice::new(&data[pos + 1..global_end]));
            }
            global_end = pos + 1;

            // Flush before exceeding the write_vectored batch limit.
            if iov.len() >= MAX_IOV {
                flush_iov(out, &iov)?;
                iov.clear();
            }
        }

        if chunk_start == 0 {
            break;
        }
        chunk_start = chunk_start.saturating_sub(CHUNK);
    }

    // Leading record: bytes before the first separator (if any).
    if global_end > 0 {
        iov.push(IoSlice::new(&data[0..global_end]));
    }
    flush_iov(out, &iov)?;
    Ok(())
}
245
/// Before-separator mode: chunk-based forward SIMD scan, emitted in reverse.
///
/// Same chunked strategy as the after-mode variant, but each record STARTS
/// with its separator byte instead of ending with it, so emitted slices
/// begin at the separator position itself.
fn tac_bytes_backward_before(data: &[u8], sep: u8, out: &mut impl Write) -> io::Result<()> {
    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);

    // End (exclusive) of the rightmost record not yet emitted.
    let mut global_end = data.len();
    let mut chunk_start = data.len().saturating_sub(CHUNK);
    // Separator-position scratch, heap-allocated once and reused per chunk
    // (capacity CHUNK covers the all-separators worst case).
    let mut positions_buf: Vec<usize> = Vec::with_capacity(CHUNK);

    loop {
        // Never scan past global_end: everything right of it is emitted.
        let chunk_end = global_end.min(data.len());
        if chunk_start >= chunk_end {
            if chunk_start == 0 {
                break;
            }
            chunk_start = chunk_start.saturating_sub(CHUNK);
            continue;
        }
        let chunk = &data[chunk_start..chunk_end];

        // Refill scratch; rebase chunk-relative offsets to the full buffer.
        positions_buf.clear();
        positions_buf.extend(memchr::memchr_iter(sep, chunk).map(|p| p + chunk_start));

        // Emit records rightmost-first; each starts at its separator.
        for &pos in positions_buf.iter().rev() {
            iov.push(IoSlice::new(&data[pos..global_end]));
            global_end = pos;

            // Flush before exceeding the write_vectored batch limit.
            if iov.len() >= MAX_IOV {
                flush_iov(out, &iov)?;
                iov.clear();
            }
        }

        if chunk_start == 0 {
            break;
        }
        chunk_start = chunk_start.saturating_sub(CHUNK);
    }

    // Leading record: bytes before the first separator (if any).
    if global_end > 0 {
        iov.push(IoSlice::new(&data[0..global_end]));
    }
    flush_iov(out, &iov)?;
    Ok(())
}
293
294/// Reverse records using a multi-byte string separator.
295/// Uses chunk-based forward SIMD-accelerated memmem + IoSlice zero-copy output.
296///
297/// For single-byte separators, delegates to tac_bytes which uses memchr (faster).
298pub fn tac_string_separator(
299    data: &[u8],
300    separator: &[u8],
301    before: bool,
302    out: &mut impl Write,
303) -> io::Result<()> {
304    if data.is_empty() {
305        return Ok(());
306    }
307
308    if separator.len() == 1 {
309        return tac_bytes(data, separator[0], before, out);
310    }
311
312    let sep_len = separator.len();
313
314    // For multi-byte separators we use the same chunk-based strategy but with
315    // memmem::find_iter instead of memchr_iter. We need FinderRev only for a
316    // quick "any separator exists?" check — the actual work uses forward Finder.
317    //
318    // We still need to handle the case where a separator straddles a chunk
319    // boundary. We do this by extending each chunk's left edge by (sep_len - 1)
320    // bytes and deduplicating matches that fall in the overlap zone.
321
322    if !before {
323        tac_string_after(data, separator, sep_len, out)
324    } else {
325        tac_string_before(data, separator, sep_len, out)
326    }
327}
328
/// Multi-byte string separator, after mode (separator ends each record).
///
/// Chunked backward walk; within each chunk a forward memmem pass finds all
/// separator occurrences. The scan region's left edge is extended by
/// (sep_len - 1) bytes so a separator straddling the chunk boundary is still
/// visible; matches starting in that overlap zone are filtered out here and
/// picked up when the chunk to their left is processed.
fn tac_string_after(
    data: &[u8],
    separator: &[u8],
    sep_len: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    // End (exclusive) of the rightmost record not yet emitted.
    let mut global_end = data.len();
    let mut chunk_start = data.len().saturating_sub(CHUNK);
    // Match-position scratch, allocated once and reused across chunks.
    let mut positions_buf: Vec<usize> = Vec::with_capacity(CHUNK / 4);

    loop {
        let chunk_end = global_end.min(data.len());
        // NOTE(review): `sep_len - 1` underflows (panics in debug) if
        // sep_len == 0; tac_string_separator routes 1-byte separators away
        // but nothing visible here rejects an empty one — confirm callers.
        let scan_start = chunk_start.saturating_sub(sep_len - 1);
        if scan_start >= chunk_end {
            if chunk_start == 0 {
                break;
            }
            chunk_start = chunk_start.saturating_sub(CHUNK);
            continue;
        }
        let scan_region = &data[scan_start..chunk_end];

        positions_buf.clear();
        positions_buf.extend(
            memchr::memmem::find_iter(scan_region, separator)
                .map(|p| p + scan_start)
                // Drop overlap-zone matches (found again with the next chunk).
                .filter(|&p| p >= chunk_start || chunk_start == 0)
                // Drop matches whose separator spills past emitted output.
                .filter(|&p| p + sep_len <= global_end),
        );

        // Emit records rightmost-first; each ends just after its separator.
        for &pos in positions_buf.iter().rev() {
            let rec_end_exclusive = pos + sep_len;
            if rec_end_exclusive < global_end {
                iov.push(IoSlice::new(&data[rec_end_exclusive..global_end]));
            }
            global_end = rec_end_exclusive;
            if iov.len() >= MAX_IOV {
                flush_iov(out, &iov)?;
                iov.clear();
            }
        }

        if chunk_start == 0 {
            break;
        }
        chunk_start = chunk_start.saturating_sub(CHUNK);
    }

    // Leading record: bytes before the first separator occurrence.
    if global_end > 0 {
        iov.push(IoSlice::new(&data[0..global_end]));
    }
    flush_iov(out, &iov)?;
    Ok(())
}
385
/// Multi-byte string separator, before mode (separator starts each record).
///
/// Same chunked memmem strategy as the after-mode variant; emitted slices
/// begin at the match position so each record carries its separator at the
/// head.
fn tac_string_before(
    data: &[u8],
    separator: &[u8],
    sep_len: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    // End (exclusive) of the rightmost record not yet emitted.
    let mut global_end = data.len();
    let mut chunk_start = data.len().saturating_sub(CHUNK);
    // Match-position scratch, allocated once and reused across chunks.
    let mut positions_buf: Vec<usize> = Vec::with_capacity(CHUNK / 4);

    loop {
        let chunk_end = global_end.min(data.len());
        // NOTE(review): `sep_len - 1` underflows (panics in debug) if
        // sep_len == 0 — same caveat as tac_string_after; confirm callers.
        let scan_start = chunk_start.saturating_sub(sep_len - 1);
        if scan_start >= chunk_end {
            if chunk_start == 0 {
                break;
            }
            chunk_start = chunk_start.saturating_sub(CHUNK);
            continue;
        }
        let scan_region = &data[scan_start..chunk_end];

        positions_buf.clear();
        positions_buf.extend(
            memchr::memmem::find_iter(scan_region, separator)
                .map(|p| p + scan_start)
                // Drop overlap-zone matches (found again with the next chunk).
                .filter(|&p| p >= chunk_start || chunk_start == 0)
                // Drop matches already inside emitted output.
                .filter(|&p| p < global_end),
        );

        // Emit records rightmost-first; each starts at its separator.
        for &pos in positions_buf.iter().rev() {
            iov.push(IoSlice::new(&data[pos..global_end]));
            global_end = pos;
            if iov.len() >= MAX_IOV {
                flush_iov(out, &iov)?;
                iov.clear();
            }
        }

        if chunk_start == 0 {
            break;
        }
        chunk_start = chunk_start.saturating_sub(CHUNK);
    }

    // Leading record: bytes before the first separator, emitted last.
    if global_end > 0 {
        iov.push(IoSlice::new(&data[0..global_end]));
    }
    flush_iov(out, &iov)?;
    Ok(())
}
439
440/// Find regex matches using backward scanning, matching GNU tac's re_search behavior.
441fn find_regex_matches_backward(data: &[u8], re: &regex::bytes::Regex) -> Vec<(usize, usize)> {
442    let mut matches = Vec::new();
443    let mut past_end = data.len();
444
445    while past_end > 0 {
446        let buf = &data[..past_end];
447        let mut found = false;
448
449        let mut pos = past_end;
450        while pos > 0 {
451            pos -= 1;
452            if let Some(m) = re.find_at(buf, pos) {
453                if m.start() == pos {
454                    matches.push((m.start(), m.end()));
455                    past_end = m.start();
456                    found = true;
457                    break;
458                }
459            }
460        }
461
462        if !found {
463            break;
464        }
465    }
466
467    matches.reverse();
468    matches
469}
470
/// Reverse records using a regex separator.
///
/// Matches are located with a backward scan (see find_regex_matches_backward)
/// to mirror GNU tac's semantics, then output is assembled as IoSlice
/// entries borrowing from `data` and flushed in MAX_IOV batches.
///
/// # Errors
/// Returns InvalidInput if `pattern` is not a valid regex; otherwise
/// propagates I/O errors from `out`.
pub fn tac_regex_separator(
    data: &[u8],
    pattern: &str,
    before: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    let re = match regex::bytes::Regex::new(pattern) {
        Ok(r) => r,
        Err(e) => {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("invalid regex '{}': {}", pattern, e),
            ));
        }
    };

    let matches = find_regex_matches_backward(data, &re);

    // No separator anywhere: the whole input is one record; emit unchanged.
    if matches.is_empty() {
        out.write_all(data)?;
        return Ok(());
    }

    let mut iov: Vec<IoSlice> = Vec::with_capacity(matches.len().min(MAX_IOV) + 2);

    if !before {
        // After mode: record i spans from the end of match i-1 through the
        // end of match i (separator included at the record's tail).
        let last_end = matches.last().unwrap().1;

        // Trailing bytes after the final separator come out first.
        if last_end < data.len() {
            iov.push(IoSlice::new(&data[last_end..]));
        }

        let mut i = matches.len();
        while i > 0 {
            i -= 1;
            let rec_start = if i == 0 { 0 } else { matches[i - 1].1 };
            iov.push(IoSlice::new(&data[rec_start..matches[i].1]));
            if iov.len() >= MAX_IOV {
                flush_iov(out, &iov)?;
                iov.clear();
            }
        }
    } else {
        // Before mode: record i spans from the start of match i (separator
        // at the record's head) to the start of match i+1 (or end of data).
        let mut i = matches.len();
        while i > 0 {
            i -= 1;
            let start = matches[i].0;
            let end = if i + 1 < matches.len() {
                matches[i + 1].0
            } else {
                data.len()
            };
            iov.push(IoSlice::new(&data[start..end]));
            if iov.len() >= MAX_IOV {
                flush_iov(out, &iov)?;
                iov.clear();
            }
        }

        // Leading bytes before the first separator are emitted last.
        if matches[0].0 > 0 {
            iov.push(IoSlice::new(&data[..matches[0].0]));
        }
    }

    flush_iov(out, &iov)?;
    Ok(())
}