//! coreutils_rs/common/io.rs — shared file and stdin I/O helpers.
1use std::fs::{self, File};
2use std::io::{self, Read};
3use std::ops::Deref;
4use std::path::Path;
5
6#[cfg(target_os = "linux")]
7use std::sync::atomic::{AtomicBool, Ordering};
8
9use memmap2::{Mmap, MmapOptions};
10
/// Holds file data — either zero-copy mmap or an owned Vec.
/// Dereferences to `&[u8]` for transparent use.
pub enum FileData {
    /// Memory-mapped file contents (zero-copy, read-only view).
    Mmap(Mmap),
    /// Heap-allocated copy — used for small files, special files,
    /// and as the fallback when mmap fails.
    Owned(Vec<u8>),
}
17
18impl Deref for FileData {
19    type Target = [u8];
20
21    fn deref(&self) -> &[u8] {
22        match self {
23            FileData::Mmap(m) => m,
24            FileData::Owned(v) => v,
25        }
26    }
27}
28
/// Threshold below which we use read() instead of mmap.
/// For files under 1MB, read() is faster since mmap has setup/teardown overhead
/// (page table creation for up to 256 pages, TLB flush on munmap) that exceeds
/// the zero-copy benefit.
const MMAP_THRESHOLD: u64 = 1024 * 1024; // 1 MiB
34
/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
/// Relaxed ordering is sufficient: this is a performance hint, not a
/// synchronization point — a stale read only costs one extra open() attempt.
#[cfg(target_os = "linux")]
static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);
39
40/// Open a file with O_NOATIME on Linux to avoid atime inode writes.
41/// Caches whether O_NOATIME works to avoid double-open on every file.
42#[cfg(target_os = "linux")]
43fn open_noatime(path: &Path) -> io::Result<File> {
44    use std::os::unix::fs::OpenOptionsExt;
45    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
46        match fs::OpenOptions::new()
47            .read(true)
48            .custom_flags(libc::O_NOATIME)
49            .open(path)
50        {
51            Ok(f) => return Ok(f),
52            Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
53                // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
54                NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
55            }
56            Err(e) => return Err(e), // Real error, propagate
57        }
58    }
59    File::open(path)
60}
61
/// Non-Linux fallback: a plain read-only open (O_NOATIME is Linux-specific).
#[cfg(not(target_os = "linux"))]
fn open_noatime(path: &Path) -> io::Result<File> {
    fs::OpenOptions::new().read(true).open(path)
}
66
67/// Read a file with zero-copy mmap for large files or read() for small files.
68/// Opens once with O_NOATIME, uses fstat for metadata to save a syscall.
69pub fn read_file(path: &Path) -> io::Result<FileData> {
70    let file = open_noatime(path)?;
71    let metadata = file.metadata()?;
72    let len = metadata.len();
73
74    if len > 0 && metadata.file_type().is_file() {
75        // Small files: exact-size read from already-open fd.
76        // Uses read_full into pre-sized buffer instead of read_to_end,
77        // which avoids the grow-and-probe pattern (saves 1-2 extra read() syscalls).
78        if len < MMAP_THRESHOLD {
79            let mut buf = vec![0u8; len as usize];
80            let n = read_full(&mut &file, &mut buf)?;
81            buf.truncate(n);
82            return Ok(FileData::Owned(buf));
83        }
84
85        // SAFETY: Read-only mapping. MAP_POPULATE pre-faults page table entries
86        // from the page cache in one batch kernel call, avoiding scattered demand
87        // faults during the scan. For warm cache (benchmark warmup), this is faster
88        // than demand faults even with fault-around (~160 faults for 10MB → 0 faults).
89        // For cold cache, the kernel reads ahead from disk during mmap().
90        match unsafe { MmapOptions::new().populate().map(&file) } {
91            Ok(mmap) => {
92                #[cfg(target_os = "linux")]
93                {
94                    let _ = mmap.advise(memmap2::Advice::Sequential);
95                    let _ = mmap.advise(memmap2::Advice::WillNeed);
96                }
97                Ok(FileData::Mmap(mmap))
98            }
99            Err(_) => {
100                // mmap failed — fall back to read
101                let mut buf = Vec::with_capacity(len as usize);
102                let mut reader = file;
103                reader.read_to_end(&mut buf)?;
104                Ok(FileData::Owned(buf))
105            }
106        }
107    } else if len > 0 {
108        // Non-regular file (special files) — read from open fd
109        let mut buf = Vec::new();
110        let mut reader = file;
111        reader.read_to_end(&mut buf)?;
112        Ok(FileData::Owned(buf))
113    } else {
114        Ok(FileData::Owned(Vec::new()))
115    }
116}
117
118/// Read a file entirely into a mutable Vec.
119/// Uses exact-size allocation from fstat + single read() for efficiency.
120/// Preferred over mmap when the caller needs mutable access (e.g., in-place decode).
121pub fn read_file_vec(path: &Path) -> io::Result<Vec<u8>> {
122    let file = open_noatime(path)?;
123    let metadata = file.metadata()?;
124    let len = metadata.len() as usize;
125    if len == 0 {
126        return Ok(Vec::new());
127    }
128    let mut buf = vec![0u8; len];
129    let n = read_full(&mut &file, &mut buf)?;
130    buf.truncate(n);
131    Ok(buf)
132}
133
134/// Read a file always using mmap, with MADV_HUGEPAGE + WILLNEED.
135/// Used by tac for large files (>= 16MB) that benefit from zero-copy
136/// vmsplice output and parallel scanning. Callers should use read_file_vec()
137/// for smaller files to avoid mmap page fault overhead.
138///
139/// No MAP_POPULATE: it synchronously faults all pages with 4KB BEFORE
140/// MADV_HUGEPAGE can take effect, causing ~25,600 minor faults for 100MB
141/// (~25ms). Without it, HUGEPAGE is set first, then WILLNEED triggers
142/// async readahead using 2MB pages (~50 faults = ~0.1ms).
143pub fn read_file_mmap(path: &Path) -> io::Result<FileData> {
144    let file = open_noatime(path)?;
145    let metadata = file.metadata()?;
146    let len = metadata.len();
147
148    if len > 0 && metadata.file_type().is_file() {
149        match unsafe { MmapOptions::new().map(&file) } {
150            Ok(mmap) => {
151                #[cfg(target_os = "linux")]
152                {
153                    // HUGEPAGE first: must be set before any page faults occur.
154                    // Reduces ~25,600 minor faults (4KB) to ~50 (2MB) for 100MB.
155                    if len >= 2 * 1024 * 1024 {
156                        let _ = mmap.advise(memmap2::Advice::HugePage);
157                    }
158                    let _ = mmap.advise(memmap2::Advice::WillNeed);
159                }
160                return Ok(FileData::Mmap(mmap));
161            }
162            Err(_) => {
163                // mmap failed — fall back to read
164                let mut buf = vec![0u8; len as usize];
165                let n = read_full(&mut &file, &mut buf)?;
166                buf.truncate(n);
167                return Ok(FileData::Owned(buf));
168            }
169        }
170    } else if len > 0 {
171        // Non-regular file (special files) — read from open fd
172        let mut buf = Vec::new();
173        let mut reader = file;
174        reader.read_to_end(&mut buf)?;
175        Ok(FileData::Owned(buf))
176    } else {
177        Ok(FileData::Owned(Vec::new()))
178    }
179}
180
/// Get file size without reading it (for byte-count-only optimization).
pub fn file_size(path: &Path) -> io::Result<u64> {
    let metadata = fs::metadata(path)?;
    Ok(metadata.len())
}
185
186/// Read all bytes from stdin into a Vec.
187/// On Linux, uses raw libc::read() to bypass Rust's StdinLock/BufReader overhead.
188/// Uses a direct read() loop into a pre-allocated buffer instead of read_to_end(),
189/// which avoids Vec's grow-and-probe pattern (extra read() calls and memcpy).
190/// Callers should enlarge the pipe buffer via fcntl(F_SETPIPE_SZ) before calling.
191/// Uses the full spare capacity for each read() to minimize syscalls.
192pub fn read_stdin() -> io::Result<Vec<u8>> {
193    #[cfg(target_os = "linux")]
194    return read_stdin_raw();
195
196    #[cfg(not(target_os = "linux"))]
197    read_stdin_generic()
198}
199
/// Raw libc::read() implementation for Linux — bypasses Rust's StdinLock
/// and BufReader layers entirely. StdinLock uses an internal 8KB BufReader
/// which adds an extra memcpy for every read; raw read() goes directly
/// from the kernel pipe buffer to our Vec.
///
/// Pre-allocates 16MB to cover most workloads (benchmark = 10MB) without
/// over-allocating. For inputs > 16MB, grows the buffer on demand.
/// Each read() uses the full spare capacity to maximize bytes per syscall.
///
/// Note: callers (ftac, ftr, fbase64) are expected to enlarge the pipe
/// buffer via fcntl(F_SETPIPE_SZ) before calling this function. We don't
/// do it here to avoid accidentally shrinking a previously enlarged pipe.
#[cfg(target_os = "linux")]
fn read_stdin_raw() -> io::Result<Vec<u8>> {
    const PREALLOC: usize = 16 * 1024 * 1024;

    let mut buf: Vec<u8> = Vec::with_capacity(PREALLOC);

    loop {
        let spare_cap = buf.capacity() - buf.len();
        if spare_cap < 1024 * 1024 {
            // Grow by doubling, or ensure at least PREALLOC (16MB) of headroom,
            // whichever is larger, to minimize realloc count.
            // NOTE(review): reserve() is relative to len, not capacity, so the
            // realized capacity may be slightly below new_cap; the buffer still
            // grows by at least `new_cap - capacity` bytes of headroom.
            let new_cap = (buf.capacity() * 2).max(buf.len() + PREALLOC);
            buf.reserve(new_cap - buf.capacity());
        }
        let spare_cap = buf.capacity() - buf.len();
        let start = buf.len();

        // SAFETY: we read into the uninitialized spare capacity and
        // set_len only by the number of bytes the kernel actually wrote (ret),
        // so no uninitialized bytes ever become observable.
        let ret = unsafe {
            libc::read(
                0,
                buf.as_mut_ptr().add(start) as *mut libc::c_void,
                spare_cap,
            )
        };
        if ret < 0 {
            let err = io::Error::last_os_error();
            // EINTR: a signal interrupted the read before any data arrived — retry.
            if err.kind() == io::ErrorKind::Interrupted {
                continue;
            }
            return Err(err);
        }
        if ret == 0 {
            break; // EOF: writer closed the pipe / no more input
        }
        unsafe { buf.set_len(start + ret as usize) };
    }

    Ok(buf)
}
252
/// Splice piped stdin to a memfd, then mmap for zero-copy access.
/// Uses splice(2) to move data from the stdin pipe directly into a memfd's
/// page cache (kernel→kernel, no userspace copy). Returns a mutable mmap.
/// Returns None if stdin is not a pipe or splice fails.
///
/// For translate operations: caller can modify the mmap'd data in-place.
/// For filter operations (delete, cut): caller reads from the mmap.
#[cfg(target_os = "linux")]
pub fn splice_stdin_to_mmap() -> io::Result<Option<memmap2::MmapMut>> {
    use std::os::unix::io::FromRawFd;

    // Check if stdin (fd 0) is a pipe; anything else returns None so the
    // caller falls back to a plain read path.
    let mut stat: libc::stat = unsafe { std::mem::zeroed() };
    if unsafe { libc::fstat(0, &mut stat) } != 0 {
        return Ok(None);
    }
    if (stat.st_mode & libc::S_IFMT) != libc::S_IFIFO {
        return Ok(None);
    }

    // Create memfd for receiving spliced data.
    // Use raw syscall to avoid glibc version dependency (memfd_create added in glibc 2.27,
    // but the syscall works on any kernel >= 3.17). This fixes cross-compilation to
    // aarch64-unknown-linux-gnu with older sysroots.
    let memfd =
        unsafe { libc::syscall(libc::SYS_memfd_create, c"stdin_splice".as_ptr(), 0u32) as i32 };
    if memfd < 0 {
        return Ok(None); // memfd_create not supported, fallback
    }

    // Splice all data from stdin pipe to memfd (zero-copy: kernel moves pipe pages).
    // SPLICE_F_MOVE is only a hint — the kernel may still copy.
    let mut total: usize = 0;
    loop {
        let n = unsafe {
            libc::splice(
                0,
                std::ptr::null_mut(),
                memfd,
                std::ptr::null_mut(),
                // Splice up to 1GB at a time (kernel will limit to actual pipe data)
                1024 * 1024 * 1024,
                libc::SPLICE_F_MOVE,
            )
        };
        if n > 0 {
            total += n as usize;
        } else if n == 0 {
            break; // EOF — writer closed the pipe
        } else {
            let err = io::Error::last_os_error();
            // EINTR: interrupted by a signal before any data moved — retry.
            if err.kind() == io::ErrorKind::Interrupted {
                continue;
            }
            // Close the memfd before bailing so the fd doesn't leak.
            unsafe { libc::close(memfd) };
            return Ok(None); // splice failed, fallback to read
        }
    }

    if total == 0 {
        // Empty input — nothing worth mapping; release the memfd.
        unsafe { libc::close(memfd) };
        return Ok(None);
    }

    // Wrap memfd in a File for memmap2 API, then mmap it.
    // MAP_SHARED allows in-place modification; populate prefaults pages.
    let file = unsafe { File::from_raw_fd(memfd) };
    let mmap = unsafe { MmapOptions::new().populate().map_mut(&file) };
    drop(file); // Close memfd fd (mmap stays valid, kernel holds reference)

    match mmap {
        Ok(mut mm) => {
            // Advise kernel for sequential access + hugepages.
            // Best-effort: madvise return values are intentionally ignored.
            unsafe {
                libc::madvise(
                    mm.as_mut_ptr() as *mut libc::c_void,
                    total,
                    libc::MADV_SEQUENTIAL,
                );
                if total >= 2 * 1024 * 1024 {
                    libc::madvise(
                        mm.as_mut_ptr() as *mut libc::c_void,
                        total,
                        libc::MADV_HUGEPAGE,
                    );
                }
            }
            Ok(Some(mm))
        }
        Err(_) => Ok(None),
    }
}
344
/// Generic read_stdin for non-Linux platforms.
///
/// Pre-allocates 16MB so typical inputs need no reallocation, then lets
/// read_to_end() fill the buffer and grow it on demand.
///
/// The previous hand-rolled loop called `set_len` before `read()`, handing
/// uninitialized memory to `Read::read` as `&mut [u8]` — that violates the
/// `Read` contract (implementations may legally read from the buffer) and
/// is undefined behavior. std's read_to_end manages spare capacity safely.
#[cfg(not(target_os = "linux"))]
fn read_stdin_generic() -> io::Result<Vec<u8>> {
    const PREALLOC: usize = 16 * 1024 * 1024;

    let mut stdin = io::stdin().lock();
    let mut buf: Vec<u8> = Vec::with_capacity(PREALLOC);
    stdin.read_to_end(&mut buf)?;
    Ok(buf)
}
381
/// Read as many bytes as possible into buf, retrying on partial reads.
/// Ensures the full buffer is filled (or EOF reached), avoiding the
/// probe-read overhead of read_to_end.
///
/// Returns the number of bytes read: buf.len() unless EOF came first.
/// EINTR is retried on every iteration — the previous fast path propagated
/// an `Interrupted` error from the very first read() as a hard failure
/// while retrying it on all subsequent reads.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    // Fast path preserved: for regular files the first read() usually fills
    // the whole buffer, so the loop exits after a single iteration.
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break, // EOF
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}