// coreutils_rs/tac/core.rs

use std::io::{self, IoSlice, Write};
/// Max IoSlice entries per write_vectored batch.
/// Linux UIO_MAXIOV is 1024; we use that as our batch limit so a single
/// writev(2) call never exceeds the kernel's per-call iovec limit.
const MAX_IOV: usize = 1024;

/// Chunk size for the forward-scan-within-backward-chunks strategy.
/// 4MB gives better SIMD throughput per memchr_iter call (fewer calls needed,
/// each processing a larger contiguous region with full AVX2 pipeline).
/// For a 10MB file with 50-byte lines, 4MB chunks = 3 calls vs 5 at 2MB.
const CHUNK: usize = 4 * 1024 * 1024;

/// Flush a batch of IoSlice entries using write_vectored.
///
/// Fast path: a single writev call that writes the whole batch. On a partial
/// write — or a transient `Interrupted` (EINTR) error, in which case nothing
/// was written — falls back to `write_all` per remaining slice, skipping the
/// bytes the kernel already accepted.
#[inline]
fn flush_iov(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    let total: usize = slices.iter().map(|s| s.len()).sum();

    // Fast path: one writev call usually writes everything.
    let written = match out.write_vectored(slices) {
        Ok(n) if n >= total => return Ok(()),
        Ok(n) => n,
        // EINTR is transient and means no bytes were transferred; retry via
        // the write_all fallback (write_all loops on Interrupted internally).
        Err(e) if e.kind() == io::ErrorKind::Interrupted => 0,
        Err(e) => return Err(e),
    };

    // Slow path: skip the bytes already written, then finish each remaining
    // slice with write_all (which itself handles partial writes).
    let mut skip = written;
    for slice in slices {
        let len = slice.len();
        if skip >= len {
            skip -= len;
        } else {
            out.write_all(&slice[skip..])?;
            skip = 0;
        }
    }
    Ok(())
}

49/// Reverse records separated by a single byte.
50/// Uses chunk-based forward SIMD scan processed in reverse order for
51/// maximum throughput — eliminates per-line memrchr call overhead.
52/// Output uses write_vectored (writev) for zero-copy from mmap'd data.
53///
54/// For data up to CONTIG_LIMIT, builds a contiguous reversed output buffer
55/// and writes it in a single write() call. This is faster for pipe output
56/// because it eliminates the overhead of many writev() syscalls (a 10MB file
57/// with 50-byte lines would need ~200 writev calls of 1024 entries each;
58/// a single 10MB write() is much faster).
59///
60/// For larger data, uses the writev approach to avoid doubling memory usage.
61pub fn tac_bytes(data: &[u8], separator: u8, before: bool, out: &mut impl Write) -> io::Result<()> {
62    if data.is_empty() {
63        return Ok(());
64    }
65    // For data up to 32MB, build contiguous output buffer for single write().
66    // This trades one memcpy (fast, sequential) for N writev syscalls.
67    if data.len() <= CONTIG_LIMIT {
68        if !before {
69            tac_bytes_contig_after(data, separator, out)
70        } else {
71            tac_bytes_contig_before(data, separator, out)
72        }
73    } else if !before {
74        tac_bytes_backward_after(data, separator, out)
75    } else {
76        tac_bytes_backward_before(data, separator, out)
77    }
78}
79
/// Threshold for contiguous output buffer strategy.
/// Up to 32MB, the extra memcpy is cheaper than hundreds of writev() syscalls.
/// Above this, doubling memory with a full output copy stops being worth it.
const CONTIG_LIMIT: usize = 32 * 1024 * 1024;

84/// Contiguous-output after-separator mode: build reversed output in one buffer,
85/// then write it all at once. For pipe output, a single large write() is
86/// significantly faster than many writev() calls.
87fn tac_bytes_contig_after(data: &[u8], sep: u8, out: &mut impl Write) -> io::Result<()> {
88    // Pre-allocate output buffer (same size as input since tac preserves all bytes)
89    let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
90    let dst = outbuf.as_mut_ptr();
91    let mut wp = 0usize;
92
93    let mut global_end = data.len();
94    let mut positions_buf: Vec<usize> = Vec::with_capacity(CHUNK);
95    let mut chunk_start = data.len().saturating_sub(CHUNK);
96
97    loop {
98        let chunk_end = global_end.min(data.len());
99        if chunk_start >= chunk_end {
100            if chunk_start == 0 {
101                break;
102            }
103            chunk_start = chunk_start.saturating_sub(CHUNK);
104            continue;
105        }
106        let chunk = &data[chunk_start..chunk_end];
107
108        positions_buf.clear();
109        positions_buf.extend(memchr::memchr_iter(sep, chunk).map(|p| p + chunk_start));
110
111        for &pos in positions_buf.iter().rev() {
112            let rec_start = pos + 1;
113            let rec_len = global_end - rec_start;
114            if rec_len > 0 {
115                unsafe {
116                    std::ptr::copy_nonoverlapping(
117                        data.as_ptr().add(rec_start),
118                        dst.add(wp),
119                        rec_len,
120                    );
121                }
122                wp += rec_len;
123            }
124            global_end = pos + 1;
125        }
126
127        if chunk_start == 0 {
128            break;
129        }
130        chunk_start = chunk_start.saturating_sub(CHUNK);
131    }
132
133    // First record
134    if global_end > 0 {
135        unsafe {
136            std::ptr::copy_nonoverlapping(data.as_ptr(), dst.add(wp), global_end);
137        }
138        wp += global_end;
139    }
140
141    unsafe { outbuf.set_len(wp) };
142    out.write_all(&outbuf)
143}
144
145/// Contiguous-output before-separator mode.
146fn tac_bytes_contig_before(data: &[u8], sep: u8, out: &mut impl Write) -> io::Result<()> {
147    let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
148    let dst = outbuf.as_mut_ptr();
149    let mut wp = 0usize;
150
151    let mut global_end = data.len();
152    let mut positions_buf: Vec<usize> = Vec::with_capacity(CHUNK);
153    let mut chunk_start = data.len().saturating_sub(CHUNK);
154
155    loop {
156        let chunk_end = global_end.min(data.len());
157        if chunk_start >= chunk_end {
158            if chunk_start == 0 {
159                break;
160            }
161            chunk_start = chunk_start.saturating_sub(CHUNK);
162            continue;
163        }
164        let chunk = &data[chunk_start..chunk_end];
165
166        positions_buf.clear();
167        positions_buf.extend(memchr::memchr_iter(sep, chunk).map(|p| p + chunk_start));
168
169        for &pos in positions_buf.iter().rev() {
170            let rec_len = global_end - pos;
171            if rec_len > 0 {
172                unsafe {
173                    std::ptr::copy_nonoverlapping(data.as_ptr().add(pos), dst.add(wp), rec_len);
174                }
175                wp += rec_len;
176            }
177            global_end = pos;
178        }
179
180        if chunk_start == 0 {
181            break;
182        }
183        chunk_start = chunk_start.saturating_sub(CHUNK);
184    }
185
186    if global_end > 0 {
187        unsafe {
188            std::ptr::copy_nonoverlapping(data.as_ptr(), dst.add(wp), global_end);
189        }
190        wp += global_end;
191    }
192
193    unsafe { outbuf.set_len(wp) };
194    out.write_all(&outbuf)
195}
196
197/// After-separator mode: chunk-based forward SIMD scan, emitted in reverse.
198///
199/// Instead of calling memrchr once per line (high per-call overhead for short
200/// lines), we process the file backward in large chunks. Within each chunk a
201/// single memchr_iter forward pass finds ALL separator positions with full SIMD
202/// pipeline utilisation, then we emit the records (slices) in reverse order.
203///
204/// This converts ~200K memrchr calls (for a 10MB / 50-byte-line file) into
205/// ~20 memchr_iter calls, each scanning a contiguous 512KB region.
206///
207/// Optimization: Instead of collecting positions into a Vec, we store them
208/// in a stack-allocated array to avoid heap allocation per chunk.
209fn tac_bytes_backward_after(data: &[u8], sep: u8, out: &mut impl Write) -> io::Result<()> {
210    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
211
212    // `global_end` tracks where the current (rightmost unseen) record ends.
213    let mut global_end = data.len();
214
215    // Stack-allocated positions buffer: avoid heap allocation per chunk.
216    // 512KB chunk / 1 byte per separator = 512K max positions.
217    // We use a heap-allocated buffer once but reuse across all chunks.
218    let mut positions_buf: Vec<usize> = Vec::with_capacity(CHUNK);
219
220    let mut chunk_start = data.len().saturating_sub(CHUNK);
221
222    loop {
223        let chunk_end = global_end.min(data.len());
224        if chunk_start >= chunk_end {
225            if chunk_start == 0 {
226                break;
227            }
228            chunk_start = chunk_start.saturating_sub(CHUNK);
229            continue;
230        }
231        let chunk = &data[chunk_start..chunk_end];
232
233        // Reuse positions buffer: clear and refill without reallocation.
234        positions_buf.clear();
235        positions_buf.extend(memchr::memchr_iter(sep, chunk).map(|p| p + chunk_start));
236
237        // Emit records in reverse (rightmost first).
238        for &pos in positions_buf.iter().rev() {
239            if pos + 1 < global_end {
240                iov.push(IoSlice::new(&data[pos + 1..global_end]));
241            }
242            global_end = pos + 1;
243
244            if iov.len() >= MAX_IOV {
245                flush_iov(out, &iov)?;
246                iov.clear();
247            }
248        }
249
250        if chunk_start == 0 {
251            break;
252        }
253        chunk_start = chunk_start.saturating_sub(CHUNK);
254    }
255
256    // First record
257    if global_end > 0 {
258        iov.push(IoSlice::new(&data[0..global_end]));
259    }
260    flush_iov(out, &iov)?;
261    Ok(())
262}
263
264/// Before-separator mode: chunk-based forward SIMD scan, emitted in reverse.
265///
266/// Same chunked strategy as after-mode, but each record STARTS with its
267/// separator byte instead of ending with it.
268fn tac_bytes_backward_before(data: &[u8], sep: u8, out: &mut impl Write) -> io::Result<()> {
269    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
270
271    let mut global_end = data.len();
272    let mut chunk_start = data.len().saturating_sub(CHUNK);
273    let mut positions_buf: Vec<usize> = Vec::with_capacity(CHUNK);
274
275    loop {
276        let chunk_end = global_end.min(data.len());
277        if chunk_start >= chunk_end {
278            if chunk_start == 0 {
279                break;
280            }
281            chunk_start = chunk_start.saturating_sub(CHUNK);
282            continue;
283        }
284        let chunk = &data[chunk_start..chunk_end];
285
286        positions_buf.clear();
287        positions_buf.extend(memchr::memchr_iter(sep, chunk).map(|p| p + chunk_start));
288
289        for &pos in positions_buf.iter().rev() {
290            iov.push(IoSlice::new(&data[pos..global_end]));
291            global_end = pos;
292
293            if iov.len() >= MAX_IOV {
294                flush_iov(out, &iov)?;
295                iov.clear();
296            }
297        }
298
299        if chunk_start == 0 {
300            break;
301        }
302        chunk_start = chunk_start.saturating_sub(CHUNK);
303    }
304
305    if global_end > 0 {
306        iov.push(IoSlice::new(&data[0..global_end]));
307    }
308    flush_iov(out, &iov)?;
309    Ok(())
310}
311
312/// Reverse records using a multi-byte string separator.
313/// Uses chunk-based forward SIMD-accelerated memmem + IoSlice zero-copy output.
314///
315/// For single-byte separators, delegates to tac_bytes which uses memchr (faster).
316pub fn tac_string_separator(
317    data: &[u8],
318    separator: &[u8],
319    before: bool,
320    out: &mut impl Write,
321) -> io::Result<()> {
322    if data.is_empty() {
323        return Ok(());
324    }
325
326    if separator.len() == 1 {
327        return tac_bytes(data, separator[0], before, out);
328    }
329
330    let sep_len = separator.len();
331
332    // For multi-byte separators we use the same chunk-based strategy but with
333    // memmem::find_iter instead of memchr_iter. We need FinderRev only for a
334    // quick "any separator exists?" check — the actual work uses forward Finder.
335    //
336    // We still need to handle the case where a separator straddles a chunk
337    // boundary. We do this by extending each chunk's left edge by (sep_len - 1)
338    // bytes and deduplicating matches that fall in the overlap zone.
339
340    if !before {
341        tac_string_after(data, separator, sep_len, out)
342    } else {
343        tac_string_before(data, separator, sep_len, out)
344    }
345}
346
347/// Multi-byte string separator, after mode (separator at end of record).
348fn tac_string_after(
349    data: &[u8],
350    separator: &[u8],
351    sep_len: usize,
352    out: &mut impl Write,
353) -> io::Result<()> {
354    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
355    let mut global_end = data.len();
356    let mut chunk_start = data.len().saturating_sub(CHUNK);
357    let mut positions_buf: Vec<usize> = Vec::with_capacity(CHUNK / 4);
358
359    loop {
360        let chunk_end = global_end.min(data.len());
361        let scan_start = chunk_start.saturating_sub(sep_len - 1);
362        if scan_start >= chunk_end {
363            if chunk_start == 0 {
364                break;
365            }
366            chunk_start = chunk_start.saturating_sub(CHUNK);
367            continue;
368        }
369        let scan_region = &data[scan_start..chunk_end];
370
371        positions_buf.clear();
372        positions_buf.extend(
373            memchr::memmem::find_iter(scan_region, separator)
374                .map(|p| p + scan_start)
375                .filter(|&p| p >= chunk_start || chunk_start == 0)
376                .filter(|&p| p + sep_len <= global_end),
377        );
378
379        for &pos in positions_buf.iter().rev() {
380            let rec_end_exclusive = pos + sep_len;
381            if rec_end_exclusive < global_end {
382                iov.push(IoSlice::new(&data[rec_end_exclusive..global_end]));
383            }
384            global_end = rec_end_exclusive;
385            if iov.len() >= MAX_IOV {
386                flush_iov(out, &iov)?;
387                iov.clear();
388            }
389        }
390
391        if chunk_start == 0 {
392            break;
393        }
394        chunk_start = chunk_start.saturating_sub(CHUNK);
395    }
396
397    if global_end > 0 {
398        iov.push(IoSlice::new(&data[0..global_end]));
399    }
400    flush_iov(out, &iov)?;
401    Ok(())
402}
403
404/// Multi-byte string separator, before mode (separator at start of record).
405fn tac_string_before(
406    data: &[u8],
407    separator: &[u8],
408    sep_len: usize,
409    out: &mut impl Write,
410) -> io::Result<()> {
411    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
412    let mut global_end = data.len();
413    let mut chunk_start = data.len().saturating_sub(CHUNK);
414    let mut positions_buf: Vec<usize> = Vec::with_capacity(CHUNK / 4);
415
416    loop {
417        let chunk_end = global_end.min(data.len());
418        let scan_start = chunk_start.saturating_sub(sep_len - 1);
419        if scan_start >= chunk_end {
420            if chunk_start == 0 {
421                break;
422            }
423            chunk_start = chunk_start.saturating_sub(CHUNK);
424            continue;
425        }
426        let scan_region = &data[scan_start..chunk_end];
427
428        positions_buf.clear();
429        positions_buf.extend(
430            memchr::memmem::find_iter(scan_region, separator)
431                .map(|p| p + scan_start)
432                .filter(|&p| p >= chunk_start || chunk_start == 0)
433                .filter(|&p| p < global_end),
434        );
435
436        for &pos in positions_buf.iter().rev() {
437            iov.push(IoSlice::new(&data[pos..global_end]));
438            global_end = pos;
439            if iov.len() >= MAX_IOV {
440                flush_iov(out, &iov)?;
441                iov.clear();
442            }
443        }
444
445        if chunk_start == 0 {
446            break;
447        }
448        chunk_start = chunk_start.saturating_sub(CHUNK);
449    }
450
451    if global_end > 0 {
452        iov.push(IoSlice::new(&data[0..global_end]));
453    }
454    flush_iov(out, &iov)?;
455    Ok(())
456}
457
/// Find regex matches using backward scanning, matching GNU tac's re_search behavior.
///
/// Repeatedly finds the match whose start position is the RIGHTMOST inside
/// `data[..past_end]`, records its (start, end) byte offsets, then continues
/// searching in the prefix before that match. The returned pairs are sorted
/// ascending by start offset.
///
/// NOTE(review): the inner loop probes every candidate start position with
/// `find_at` and accepts only exact-position hits, so the worst case is
/// quadratic in `data.len()`. This is deliberate: backward search semantics
/// (rightmost match START) differ from a forward `find_iter` pass when
/// matches can overlap, so do not "optimize" this into a single forward scan.
fn find_regex_matches_backward(data: &[u8], re: &regex::bytes::Regex) -> Vec<(usize, usize)> {
    let mut matches = Vec::new();
    // Exclusive upper bound of the region still to be searched.
    let mut past_end = data.len();

    while past_end > 0 {
        let buf = &data[..past_end];
        let mut found = false;

        // Probe start positions right-to-left; `find_at` returns the leftmost
        // match beginning at or after `pos`, so only accept it when it begins
        // exactly at `pos`.
        let mut pos = past_end;
        while pos > 0 {
            pos -= 1;
            if let Some(m) = re.find_at(buf, pos) {
                if m.start() == pos {
                    matches.push((m.start(), m.end()));
                    past_end = m.start();
                    found = true;
                    break;
                }
            }
        }

        // No match anywhere in the remaining prefix: done.
        if !found {
            break;
        }
    }

    // Matches were collected right-to-left; return them in ascending order.
    matches.reverse();
    matches
}

489/// Reverse records using a regex separator.
490pub fn tac_regex_separator(
491    data: &[u8],
492    pattern: &str,
493    before: bool,
494    out: &mut impl Write,
495) -> io::Result<()> {
496    if data.is_empty() {
497        return Ok(());
498    }
499
500    let re = match regex::bytes::Regex::new(pattern) {
501        Ok(r) => r,
502        Err(e) => {
503            return Err(io::Error::new(
504                io::ErrorKind::InvalidInput,
505                format!("invalid regex '{}': {}", pattern, e),
506            ));
507        }
508    };
509
510    let matches = find_regex_matches_backward(data, &re);
511
512    if matches.is_empty() {
513        out.write_all(data)?;
514        return Ok(());
515    }
516
517    let mut iov: Vec<IoSlice> = Vec::with_capacity(matches.len().min(MAX_IOV) + 2);
518
519    if !before {
520        let last_end = matches.last().unwrap().1;
521
522        if last_end < data.len() {
523            iov.push(IoSlice::new(&data[last_end..]));
524        }
525
526        let mut i = matches.len();
527        while i > 0 {
528            i -= 1;
529            let rec_start = if i == 0 { 0 } else { matches[i - 1].1 };
530            iov.push(IoSlice::new(&data[rec_start..matches[i].1]));
531            if iov.len() >= MAX_IOV {
532                flush_iov(out, &iov)?;
533                iov.clear();
534            }
535        }
536    } else {
537        let mut i = matches.len();
538        while i > 0 {
539            i -= 1;
540            let start = matches[i].0;
541            let end = if i + 1 < matches.len() {
542                matches[i + 1].0
543            } else {
544                data.len()
545            };
546            iov.push(IoSlice::new(&data[start..end]));
547            if iov.len() >= MAX_IOV {
548                flush_iov(out, &iov)?;
549                iov.clear();
550            }
551        }
552
553        if matches[0].0 > 0 {
554            iov.push(IoSlice::new(&data[..matches[0].0]));
555        }
556    }
557
558    flush_iov(out, &iov)?;
559    Ok(())
560}