// Source: coreutils_rs/common/io.rs

1use std::fs::{self, File};
2use std::io::{self, Read};
3use std::ops::Deref;
4use std::path::Path;
5
6#[cfg(target_os = "linux")]
7use std::sync::atomic::{AtomicBool, Ordering};
8
9use memmap2::{Mmap, MmapOptions};
10
11/// Holds file data — either zero-copy mmap or an owned Vec.
12/// Dereferences to `&[u8]` for transparent use.
13pub enum FileData {
14    Mmap(Mmap),
15    Owned(Vec<u8>),
16}
17
18impl Deref for FileData {
19    type Target = [u8];
20
21    fn deref(&self) -> &[u8] {
22        match self {
23            FileData::Mmap(m) => m,
24            FileData::Owned(v) => v,
25        }
26    }
27}
28
/// Size cutoff, in bytes, between the `read()` path and the mmap path.
///
/// Files smaller than 1 MiB are read with `read()`: at that size the cost of
/// setting up and tearing down a mapping (page tables for up to 256 pages,
/// TLB flush on munmap) outweighs the benefit of avoiding the copy.
pub const MMAP_THRESHOLD: u64 = 1 << 20;
34
35/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
36/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
37#[cfg(target_os = "linux")]
38static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);
39
40/// Open a file with O_NOATIME on Linux to avoid atime inode writes.
41/// Caches whether O_NOATIME works to avoid double-open on every file.
42#[cfg(target_os = "linux")]
43pub fn open_noatime(path: &Path) -> io::Result<File> {
44    use std::os::unix::fs::OpenOptionsExt;
45    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
46        match fs::OpenOptions::new()
47            .read(true)
48            .custom_flags(libc::O_NOATIME)
49            .open(path)
50        {
51            Ok(f) => return Ok(f),
52            Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
53                // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
54                NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
55            }
56            Err(e) => return Err(e), // Real error, propagate
57        }
58    }
59    File::open(path)
60}
61
/// Variant for targets other than Linux: no `O_NOATIME` flag is applied, so
/// this is simply a read-only open of `path`.
#[cfg(not(target_os = "linux"))]
pub fn open_noatime(path: &Path) -> io::Result<File> {
    fs::OpenOptions::new().read(true).open(path)
}
66
/// Selects how `read_file_with_hints` prefaults a memory-mapped file.
///
/// The right page-fault strategy depends on the tool:
/// - **Eager** (what `read_file` uses): asks for `POPULATE_READ` so every page
///   is faulted in up front. Suits callers that will touch the whole file and
///   can tolerate the startup cost.
/// - **Lazy**: omits `POPULATE_READ` and lets pages fault in as they are
///   touched, so faulting overlaps with scanning (e.g. `nl` running memchr),
///   avoiding the upfront populate cost.
///
/// Independent of the mode, `HugePage` (for files >= 2MB) and `Sequential`
/// are always requested. `WillNeed` is requested for mapped files under 4MB
/// in both modes, and additionally as the Eager fallback when `PopulateRead`
/// is rejected.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum MmapHints {
    /// Prefault everything immediately (`PopulateRead`, else `WillNeed`).
    Eager,
    /// Fault pages on first access; `WillNeed` is still issued for mapped
    /// files under 4MB, where lazy faults would not overlap with processing.
    Lazy,
}
87
88/// Read a file with zero-copy mmap for large files or read() for small files.
89/// Opens once with O_NOATIME, uses fstat for metadata to save a syscall.
90pub fn read_file(path: &Path) -> io::Result<FileData> {
91    read_file_with_hints(path, MmapHints::Eager)
92}
93
94/// Read a file with configurable mmap prefault strategy.
95///
96/// See [`MmapHints`] for the available strategies. Use `MmapHints::Eager` for
97/// tools that benefit from upfront prefaulting, or `MmapHints::Lazy` for tools
98/// where SIMD scanning overlaps with lazy page faults.
99pub fn read_file_with_hints(path: &Path, hints: MmapHints) -> io::Result<FileData> {
100    let _ = &hints; // Used only on Linux for mmap advisory selection
101    let file = open_noatime(path)?;
102    let metadata = file.metadata()?;
103    let len = metadata.len();
104
105    if len > 0 && metadata.file_type().is_file() {
106        // Small files: exact-size read from already-open fd.
107        // Uses read_full into pre-sized buffer instead of read_to_end,
108        // which avoids the grow-and-probe pattern (saves 1-2 extra read() syscalls).
109        if len < MMAP_THRESHOLD {
110            let mut buf = vec![0u8; len as usize];
111            let n = read_full(&mut &file, &mut buf)?;
112            buf.truncate(n);
113            return Ok(FileData::Owned(buf));
114        }
115
116        // SAFETY: Read-only mapping. No MAP_POPULATE — it synchronously faults
117        // all pages with 4KB before MADV_HUGEPAGE can take effect, causing ~25,600
118        // minor page faults for 100MB (~12.5ms overhead). Without it, HUGEPAGE hint
119        // is set first, then POPULATE_READ prefaults using 2MB pages (~50 faults).
120        match unsafe { MmapOptions::new().map(&file) } {
121            Ok(mmap) => {
122                #[cfg(target_os = "linux")]
123                {
124                    // HUGEPAGE MUST come first: reduces 25,600 minor faults (4KB) to
125                    // ~50 faults (2MB) for 100MB files. Saves ~12ms of page fault overhead.
126                    if len >= 2 * 1024 * 1024 {
127                        let _ = mmap.advise(memmap2::Advice::HugePage);
128                    }
129                    let _ = mmap.advise(memmap2::Advice::Sequential);
130                    match hints {
131                        MmapHints::Eager => {
132                            // POPULATE_READ (5.14+): prefault with huge pages.
133                            // Fall back to WillNeed on older kernels.
134                            if len >= 4 * 1024 * 1024 {
135                                if mmap.advise(memmap2::Advice::PopulateRead).is_err() {
136                                    let _ = mmap.advise(memmap2::Advice::WillNeed);
137                                }
138                            } else {
139                                let _ = mmap.advise(memmap2::Advice::WillNeed);
140                            }
141                        }
142                        MmapHints::Lazy => {
143                            // Skip PopulateRead: pages fault on demand during SIMD scans.
144                            // Still apply WillNeed for 1-4MB files where lazy faults
145                            // won't overlap with processing (cold-cache penalty).
146                            if len < 4 * 1024 * 1024 {
147                                let _ = mmap.advise(memmap2::Advice::WillNeed);
148                            }
149                        }
150                    }
151                }
152                Ok(FileData::Mmap(mmap))
153            }
154            Err(_) => {
155                // mmap failed — fall back to read
156                let mut buf = Vec::with_capacity(len as usize);
157                let mut reader = file;
158                reader.read_to_end(&mut buf)?;
159                Ok(FileData::Owned(buf))
160            }
161        }
162    } else if !metadata.file_type().is_file() {
163        // Non-regular file (pipe, FIFO, device, process substitution) — read from open fd.
164        // Pipes report len=0 from stat(), so we must always try to read regardless of len.
165        let mut buf = Vec::new();
166        let mut reader = file;
167        reader.read_to_end(&mut buf)?;
168        Ok(FileData::Owned(buf))
169    } else {
170        Ok(FileData::Owned(Vec::new()))
171    }
172}
173
174/// Read a file entirely into a mutable Vec.
175/// Uses exact-size allocation from fstat + single read() for efficiency.
176/// Preferred over mmap when the caller needs mutable access (e.g., in-place decode).
177pub fn read_file_vec(path: &Path) -> io::Result<Vec<u8>> {
178    let file = open_noatime(path)?;
179    let metadata = file.metadata()?;
180    let len = metadata.len() as usize;
181    if len == 0 {
182        return Ok(Vec::new());
183    }
184    let mut buf = vec![0u8; len];
185    let n = read_full(&mut &file, &mut buf)?;
186    buf.truncate(n);
187    Ok(buf)
188}
189
190/// Read a file always using mmap, with optimal page fault strategy.
191/// Used by tac for zero-copy output and parallel scanning.
192///
193/// Strategy: mmap WITHOUT MAP_POPULATE, then MADV_HUGEPAGE + MADV_POPULATE_READ.
194/// MAP_POPULATE synchronously faults all pages with 4KB BEFORE MADV_HUGEPAGE
195/// can take effect, causing ~25,600 minor faults for 100MB (~12.5ms overhead).
196/// MADV_POPULATE_READ (Linux 5.14+) prefaults pages AFTER HUGEPAGE is set,
197/// using 2MB huge pages (~50 faults = ~0.1ms). Falls back to WILLNEED on
198/// older kernels.
199pub fn read_file_mmap(path: &Path) -> io::Result<FileData> {
200    let file = open_noatime(path)?;
201    let metadata = file.metadata()?;
202    let len = metadata.len();
203
204    if len > 0 && metadata.file_type().is_file() {
205        // No MAP_POPULATE: let MADV_HUGEPAGE take effect before page faults.
206        let mmap_result = unsafe { MmapOptions::new().map(&file) };
207        match mmap_result {
208            Ok(mmap) => {
209                #[cfg(target_os = "linux")]
210                {
211                    // HUGEPAGE first: must be set before any page faults occur.
212                    // Reduces ~25,600 minor faults (4KB) to ~50 (2MB) for 100MB.
213                    if len >= 2 * 1024 * 1024 {
214                        let _ = mmap.advise(memmap2::Advice::HugePage);
215                    }
216                    // POPULATE_READ (Linux 5.14+): synchronously prefaults all pages
217                    // using huge pages. Falls back to WILLNEED on older kernels.
218                    if len >= 4 * 1024 * 1024 {
219                        if mmap.advise(memmap2::Advice::PopulateRead).is_err() {
220                            let _ = mmap.advise(memmap2::Advice::WillNeed);
221                        }
222                    } else {
223                        let _ = mmap.advise(memmap2::Advice::WillNeed);
224                    }
225                }
226                return Ok(FileData::Mmap(mmap));
227            }
228            Err(_) => {
229                // mmap failed — fall back to read
230                let mut buf = vec![0u8; len as usize];
231                let n = read_full(&mut &file, &mut buf)?;
232                buf.truncate(n);
233                return Ok(FileData::Owned(buf));
234            }
235        }
236    } else if !metadata.file_type().is_file() {
237        // Non-regular file (pipe, FIFO, device, process substitution) — read from open fd.
238        // Pipes report len=0 from stat(), so we must always try to read regardless of len.
239        let mut buf = Vec::new();
240        let mut reader = file;
241        reader.read_to_end(&mut buf)?;
242        Ok(FileData::Owned(buf))
243    } else {
244        Ok(FileData::Owned(Vec::new()))
245    }
246}
247
248/// Read a file always using read() syscall (no mmap).
249/// Faster than mmap for 10MB files: read() handles page faults in-kernel
250/// with batched PTE allocation (~0.5ms), while mmap triggers ~2560
251/// user-space minor faults (~1-2µs each = 2.5-5ms on CI runners).
252pub fn read_file_direct(path: &Path) -> io::Result<FileData> {
253    let file = open_noatime(path)?;
254    let metadata = file.metadata()?;
255    #[cfg(target_os = "linux")]
256    {
257        // Only apply fadvise for files large enough to benefit (>= 64KB)
258        if metadata.len() >= 65536 {
259            use std::os::unix::io::AsRawFd;
260            unsafe {
261                libc::posix_fadvise(file.as_raw_fd(), 0, 0, libc::POSIX_FADV_SEQUENTIAL);
262            }
263        }
264    }
265    let len = metadata.len();
266
267    if len > 0 && metadata.file_type().is_file() {
268        let mut buf = Vec::with_capacity(len as usize);
269        io::Read::read_to_end(&mut &file, &mut buf)?;
270        Ok(FileData::Owned(buf))
271    } else if !metadata.file_type().is_file() {
272        let mut buf = Vec::new();
273        let mut reader = file;
274        reader.read_to_end(&mut buf)?;
275        Ok(FileData::Owned(buf))
276    } else {
277        Ok(FileData::Owned(Vec::new()))
278    }
279}
280
/// Reports the size of `path` in bytes from its metadata, without opening or
/// reading the file (used for byte-count-only fast paths).
pub fn file_size(path: &Path) -> io::Result<u64> {
    fs::metadata(path).map(|meta| meta.len())
}
285
286/// Read all bytes from stdin into a Vec.
287/// On Linux, uses raw libc::read() to bypass Rust's StdinLock/BufReader overhead.
288/// Uses a direct read() loop into a pre-allocated buffer instead of read_to_end(),
289/// which avoids Vec's grow-and-probe pattern (extra read() calls and memcpy).
290/// Callers should enlarge the pipe buffer via fcntl(F_SETPIPE_SZ) before calling.
291/// Uses the full spare capacity for each read() to minimize syscalls.
292pub fn read_stdin() -> io::Result<Vec<u8>> {
293    #[cfg(target_os = "linux")]
294    return read_stdin_raw();
295
296    #[cfg(not(target_os = "linux"))]
297    read_stdin_generic()
298}
299
300/// Raw libc::read() implementation for Linux — bypasses Rust's StdinLock
301/// and BufReader layers entirely. StdinLock uses an internal 8KB BufReader
302/// which adds an extra memcpy for every read; raw read() goes directly
303/// from the kernel pipe buffer to our Vec.
304///
305/// Pre-allocates 16MB to cover most workloads (benchmark = 10MB) without
306/// over-allocating. For inputs > 16MB, doubles capacity on demand.
307/// Each read() uses the full spare capacity to maximize bytes per syscall.
308///
309/// Note: callers (ftac, ftr, fbase64) are expected to enlarge the pipe
310/// buffer via fcntl(F_SETPIPE_SZ) before calling this function. We don't
311/// do it here to avoid accidentally shrinking a previously enlarged pipe.
312#[cfg(target_os = "linux")]
313fn read_stdin_raw() -> io::Result<Vec<u8>> {
314    const PREALLOC: usize = 16 * 1024 * 1024;
315
316    let mut buf: Vec<u8> = Vec::with_capacity(PREALLOC);
317
318    loop {
319        let spare_cap = buf.capacity() - buf.len();
320        if spare_cap < 1024 * 1024 {
321            // Grow by doubling (or at least 64MB) to minimize realloc count
322            let new_cap = (buf.capacity() * 2).max(buf.len() + PREALLOC);
323            buf.reserve(new_cap - buf.capacity());
324        }
325        let spare_cap = buf.capacity() - buf.len();
326        let start = buf.len();
327
328        // SAFETY: we read into the uninitialized spare capacity and extend
329        // set_len only by the number of bytes actually read.
330        let ret = unsafe {
331            libc::read(
332                0,
333                buf.as_mut_ptr().add(start) as *mut libc::c_void,
334                spare_cap,
335            )
336        };
337        if ret < 0 {
338            let err = io::Error::last_os_error();
339            if err.kind() == io::ErrorKind::Interrupted {
340                continue;
341            }
342            return Err(err);
343        }
344        if ret == 0 {
345            break;
346        }
347        unsafe { buf.set_len(start + ret as usize) };
348    }
349
350    Ok(buf)
351}
352
353/// Splice piped stdin to a memfd, then mmap for zero-copy access.
354/// Uses splice(2) to move data from the stdin pipe directly into a memfd's
355/// page cache (kernel→kernel, no userspace copy). Returns a mutable mmap.
356/// Returns None if stdin is not a pipe or splice fails.
357///
358/// For translate operations: caller can modify the mmap'd data in-place.
359/// For filter operations (delete, cut): caller reads from the mmap.
360#[cfg(target_os = "linux")]
361pub fn splice_stdin_to_mmap() -> io::Result<Option<memmap2::MmapMut>> {
362    use std::os::unix::io::FromRawFd;
363
364    // Check if stdin is a pipe
365    let mut stat: libc::stat = unsafe { std::mem::zeroed() };
366    if unsafe { libc::fstat(0, &mut stat) } != 0 {
367        return Ok(None);
368    }
369    if (stat.st_mode & libc::S_IFMT) != libc::S_IFIFO {
370        return Ok(None);
371    }
372
373    // Create memfd for receiving spliced data.
374    // Use raw syscall to avoid glibc version dependency (memfd_create added in glibc 2.27,
375    // but the syscall works on any kernel >= 3.17). This fixes cross-compilation to
376    // aarch64-unknown-linux-gnu with older sysroots.
377    let memfd =
378        unsafe { libc::syscall(libc::SYS_memfd_create, c"stdin_splice".as_ptr(), 0u32) as i32 };
379    if memfd < 0 {
380        return Ok(None); // memfd_create not supported, fallback
381    }
382
383    // Splice all data from stdin pipe to memfd (zero-copy: kernel moves pipe pages)
384    let mut total: usize = 0;
385    loop {
386        let n = unsafe {
387            libc::splice(
388                0,
389                std::ptr::null_mut(),
390                memfd,
391                std::ptr::null_mut(),
392                // Splice up to 1GB at a time (kernel will limit to actual pipe data)
393                1024 * 1024 * 1024,
394                libc::SPLICE_F_MOVE,
395            )
396        };
397        if n > 0 {
398            total += n as usize;
399        } else if n == 0 {
400            break; // EOF
401        } else {
402            let err = io::Error::last_os_error();
403            if err.kind() == io::ErrorKind::Interrupted {
404                continue;
405            }
406            unsafe { libc::close(memfd) };
407            return Ok(None); // splice failed, fallback to read
408        }
409    }
410
411    if total == 0 {
412        unsafe { libc::close(memfd) };
413        return Ok(None);
414    }
415
416    // Truncate memfd to exact data size. splice() may leave the memfd larger than
417    // `total` (page-aligned), and mmap would map the full file including zero padding.
418    // Without ftruncate, callers get a mmap with garbage/zero bytes beyond `total`.
419    if unsafe { libc::ftruncate(memfd, total as libc::off_t) } != 0 {
420        unsafe { libc::close(memfd) };
421        return Ok(None);
422    }
423
424    // Wrap memfd in a File for memmap2 API, then mmap it.
425    // MAP_SHARED allows in-place modification; populate prefaults pages.
426    let file = unsafe { File::from_raw_fd(memfd) };
427    let mmap = unsafe { MmapOptions::new().populate().map_mut(&file) };
428    drop(file); // Close memfd fd (mmap stays valid, kernel holds reference)
429
430    match mmap {
431        Ok(mut mm) => {
432            // Advise kernel for sequential access + hugepages
433            unsafe {
434                libc::madvise(
435                    mm.as_mut_ptr() as *mut libc::c_void,
436                    total,
437                    libc::MADV_SEQUENTIAL,
438                );
439                if total >= 2 * 1024 * 1024 {
440                    libc::madvise(
441                        mm.as_mut_ptr() as *mut libc::c_void,
442                        total,
443                        libc::MADV_HUGEPAGE,
444                    );
445                }
446            }
447            Ok(Some(mm))
448        }
449        Err(_) => Ok(None),
450    }
451}
452
/// Generic read_stdin for non-Linux platforms.
///
/// Reads through the locked stdin handle into a large zero-initialized buffer
/// and tracks how much of it has been filled. Each `read` is offered the whole
/// unfilled tail, so syscalls stay few. When fewer than 4MB of room remain,
/// the buffer is doubled (by at least 16MB).
///
/// The buffer is kept fully initialized at all times: `Read::read` takes a
/// `&mut [u8]`, and exposing uninitialized capacity through `Vec::set_len`
/// would violate that method's safety contract. The initial zeroed allocation
/// comes from `vec![0u8; N]`; only growth beyond it pays for zero-filling.
///
/// # Errors
///
/// Returns the first read error other than `ErrorKind::Interrupted`, which is
/// retried.
#[cfg(not(target_os = "linux"))]
fn read_stdin_generic() -> io::Result<Vec<u8>> {
    const PREALLOC: usize = 16 * 1024 * 1024;
    const READ_BUF: usize = 4 * 1024 * 1024;

    let mut stdin = io::stdin().lock();
    let mut buf = vec![0u8; PREALLOC];
    // Number of leading bytes of `buf` that hold real input.
    let mut filled: usize = 0;

    loop {
        if buf.len() - filled < READ_BUF {
            // Double the usable area so the total growth cost stays amortized.
            let grown = buf.len() + buf.len().max(PREALLOC);
            buf.resize(grown, 0);
        }

        match stdin.read(&mut buf[filled..]) {
            Ok(0) => break, // EOF
            Ok(n) => filled += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }

    // Drop the unused zeroed tail so callers see only the bytes read.
    buf.truncate(filled);
    Ok(buf)
}
489
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Keeps calling `reader.read` on the unfilled tail of `buf` until the buffer
/// is full or the reader reports EOF (a read of zero bytes). For regular files
/// the first call usually fills the whole buffer, so the common case is a
/// single read with no follow-up probe.
///
/// Returns the number of bytes written to the front of `buf`; this is less
/// than `buf.len()` only when EOF was reached first. An empty `buf` yields
/// `Ok(0)` without touching the reader.
///
/// # Errors
///
/// `ErrorKind::Interrupted` is retried on every call, including the first.
/// Any other error is returned immediately.
#[inline]
pub fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break, // EOF before the buffer was filled
            Ok(n) => total += n,
            // A signal interrupted the call before any data moved: just retry.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
513
514/// Try to mmap stdin when it's a regular file (shell redirect `< file`).
515/// Returns None if stdin is a pipe/terminal or file is too small.
516/// Only mmaps files >= `min_size` bytes to avoid mmap setup/teardown overhead.
517#[cfg(unix)]
518pub fn try_mmap_stdin(min_size: u64) -> Option<Mmap> {
519    try_mmap_stdin_with_hints(min_size, true)
520}
521
522/// Try to mmap stdin if it's a regular file above `min_size`.
523/// When `sequential` is true, applies MADV_SEQUENTIAL (forward read).
524/// When false, skips MADV_SEQUENTIAL (for tools like tac that read backward).
525/// MADV_HUGEPAGE is always applied for large mappings.
526#[cfg(unix)]
527pub fn try_mmap_stdin_with_hints(min_size: u64, sequential: bool) -> Option<Mmap> {
528    use std::os::unix::io::{AsRawFd, FromRawFd};
529    let stdin = std::io::stdin();
530    let fd = stdin.as_raw_fd();
531
532    let mut stat: libc::stat = unsafe { std::mem::zeroed() };
533    if unsafe { libc::fstat(fd, &mut stat) } != 0 {
534        return None;
535    }
536    if (stat.st_mode & libc::S_IFMT) != libc::S_IFREG || stat.st_size <= 0 {
537        return None;
538    }
539    if (stat.st_size as u64) < min_size {
540        return None;
541    }
542
543    let file = unsafe { std::fs::File::from_raw_fd(fd) };
544    let mmap = unsafe { MmapOptions::new().map(&file) }.ok();
545    std::mem::forget(file); // Don't close stdin
546    #[cfg(target_os = "linux")]
547    if let Some(ref m) = mmap {
548        unsafe {
549            // HUGEPAGE first (before any page faults trigger 4KB allocation)
550            if m.len() >= 2 * 1024 * 1024 {
551                libc::madvise(
552                    m.as_ptr() as *mut libc::c_void,
553                    m.len(),
554                    libc::MADV_HUGEPAGE,
555                );
556            }
557            if sequential {
558                libc::madvise(
559                    m.as_ptr() as *mut libc::c_void,
560                    m.len(),
561                    libc::MADV_SEQUENTIAL,
562                );
563            }
564            // Async readahead hint — triggers kernel prefetch without blocking.
565            // Only for >= 4MB: smaller regions are covered by sequential readahead.
566            // MADV_POPULATE_READ (synchronous prefault) was considered but adds
567            // ~10ms startup latency for 100MB (~20% of total tr time), which
568            // exceeds the benefit of avoiding per-page minor faults.
569            if m.len() >= 4 * 1024 * 1024 {
570                libc::madvise(
571                    m.as_ptr() as *mut libc::c_void,
572                    m.len(),
573                    libc::MADV_WILLNEED,
574                );
575            }
576        }
577    }
578    mmap
579}