coreutils_rs/common/io.rs

1use std::fs::{self, File};
2use std::io::{self, Read};
3use std::ops::Deref;
4use std::path::Path;
5
6#[cfg(target_os = "linux")]
7use std::sync::atomic::{AtomicBool, Ordering};
8
9use memmap2::{Mmap, MmapOptions};
10
/// Holds file data — either a zero-copy mmap or an owned Vec.
/// Dereferences to `&[u8]` so callers can use either variant transparently.
pub enum FileData {
    /// Read-only memory-mapped view of a regular file (zero-copy).
    Mmap(Mmap),
    /// Heap-allocated copy (small files, pipes, or mmap-failure fallback).
    Owned(Vec<u8>),
}
17
18impl Deref for FileData {
19    type Target = [u8];
20
21    fn deref(&self) -> &[u8] {
22        match self {
23            FileData::Mmap(m) => m,
24            FileData::Owned(v) => v,
25        }
26    }
27}
28
/// Threshold below which we use read() instead of mmap.
/// For files under 1MB, read() is faster since mmap has setup/teardown overhead
/// (page table creation for up to 256 pages, TLB flush on munmap) that exceeds
/// the zero-copy benefit.
const MMAP_THRESHOLD: u64 = 1024 * 1024;

/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
/// Relaxed ordering suffices: this is a one-way flag, and a stale read only costs
/// one extra open() attempt.
#[cfg(target_os = "linux")]
static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);
39
40/// Open a file with O_NOATIME on Linux to avoid atime inode writes.
41/// Caches whether O_NOATIME works to avoid double-open on every file.
42#[cfg(target_os = "linux")]
43pub fn open_noatime(path: &Path) -> io::Result<File> {
44    use std::os::unix::fs::OpenOptionsExt;
45    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
46        match fs::OpenOptions::new()
47            .read(true)
48            .custom_flags(libc::O_NOATIME)
49            .open(path)
50        {
51            Ok(f) => return Ok(f),
52            Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
53                // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
54                NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
55            }
56            Err(e) => return Err(e), // Real error, propagate
57        }
58    }
59    File::open(path)
60}
61
/// Portable fallback: non-Linux platforms have no O_NOATIME, so this is a
/// plain read-only open with the same signature as the Linux variant.
#[cfg(not(target_os = "linux"))]
pub fn open_noatime(path: &Path) -> io::Result<File> {
    fs::OpenOptions::new().read(true).open(path)
}
66
/// Read a file with zero-copy mmap for large files or read() for small files.
/// Opens once with O_NOATIME and uses fstat on the open fd for metadata
/// (saves a path lookup). Routing:
///   - regular file, 0 < len < MMAP_THRESHOLD: exact-size read() into a Vec
///   - regular file, len >= MMAP_THRESHOLD: mmap (madvise-tuned on Linux)
///   - non-regular file (pipe/FIFO/device): stream to EOF (stat len is unreliable)
///   - empty regular file: empty Vec
pub fn read_file(path: &Path) -> io::Result<FileData> {
    let file = open_noatime(path)?;
    let metadata = file.metadata()?;
    let len = metadata.len();

    if len > 0 && metadata.file_type().is_file() {
        // Small files: exact-size read from already-open fd.
        // Uses read_full into pre-sized buffer instead of read_to_end,
        // which avoids the grow-and-probe pattern (saves 1-2 extra read() syscalls).
        if len < MMAP_THRESHOLD {
            let mut buf = vec![0u8; len as usize];
            let n = read_full(&mut &file, &mut buf)?;
            // Truncate in case the file shrank between fstat and read().
            buf.truncate(n);
            return Ok(FileData::Owned(buf));
        }

        // SAFETY: Read-only mapping of a file we just opened. No MAP_POPULATE —
        // it synchronously faults all pages at 4KB granularity before
        // MADV_HUGEPAGE can take effect. Without it, the HUGEPAGE hint is set
        // first, then POPULATE_READ prefaults using 2MB pages.
        // NOTE(review): the fault-count/latency figures below come from prior
        // benchmarking and are not verifiable from this file alone.
        match unsafe { MmapOptions::new().map(&file) } {
            Ok(mmap) => {
                #[cfg(target_os = "linux")]
                {
                    // HUGEPAGE MUST come first: the hint only helps pages that
                    // have not faulted in yet (~25,600 4KB faults -> ~50 2MB
                    // faults for 100MB files, saving ~12ms of fault overhead).
                    if len >= 2 * 1024 * 1024 {
                        let _ = mmap.advise(memmap2::Advice::HugePage);
                    }
                    let _ = mmap.advise(memmap2::Advice::Sequential);
                    // POPULATE_READ (Linux 5.14+): prefault with huge pages.
                    // On older kernels advise() fails — fall back to WillNeed.
                    if len >= 4 * 1024 * 1024 {
                        if mmap.advise(memmap2::Advice::PopulateRead).is_err() {
                            let _ = mmap.advise(memmap2::Advice::WillNeed);
                        }
                    } else {
                        let _ = mmap.advise(memmap2::Advice::WillNeed);
                    }
                }
                Ok(FileData::Mmap(mmap))
            }
            Err(_) => {
                // mmap failed (unusual filesystem, resource limits) — fall back to read().
                let mut buf = Vec::with_capacity(len as usize);
                let mut reader = file;
                reader.read_to_end(&mut buf)?;
                Ok(FileData::Owned(buf))
            }
        }
    } else if !metadata.file_type().is_file() {
        // Non-regular file (pipe, FIFO, device, process substitution) — read from open fd.
        // Pipes report len=0 from stat(), so we must always try to read regardless of len.
        let mut buf = Vec::new();
        let mut reader = file;
        reader.read_to_end(&mut buf)?;
        Ok(FileData::Owned(buf))
    } else {
        // Empty regular file.
        Ok(FileData::Owned(Vec::new()))
    }
}
129
130/// Read a file entirely into a mutable Vec.
131/// Uses exact-size allocation from fstat + single read() for efficiency.
132/// Preferred over mmap when the caller needs mutable access (e.g., in-place decode).
133pub fn read_file_vec(path: &Path) -> io::Result<Vec<u8>> {
134    let file = open_noatime(path)?;
135    let metadata = file.metadata()?;
136    let len = metadata.len() as usize;
137    if len == 0 {
138        return Ok(Vec::new());
139    }
140    let mut buf = vec![0u8; len];
141    let n = read_full(&mut &file, &mut buf)?;
142    buf.truncate(n);
143    Ok(buf)
144}
145
/// Read a file always using mmap, with optimal page fault strategy.
/// Used by tac for zero-copy output and parallel scanning.
///
/// Strategy: mmap WITHOUT MAP_POPULATE, then MADV_HUGEPAGE + MADV_POPULATE_READ.
/// MAP_POPULATE synchronously faults all pages at 4KB granularity BEFORE
/// MADV_HUGEPAGE can take effect; MADV_POPULATE_READ (Linux 5.14+) prefaults
/// AFTER the HUGEPAGE hint is set, so 2MB pages can be used. Falls back to
/// WILLNEED on older kernels, and to a plain read() if mmap itself fails.
/// NOTE(review): the fault-count/latency figures in the comments come from
/// prior benchmarking and are not verifiable from this file alone.
pub fn read_file_mmap(path: &Path) -> io::Result<FileData> {
    let file = open_noatime(path)?;
    let metadata = file.metadata()?;
    let len = metadata.len();

    if len > 0 && metadata.file_type().is_file() {
        // No MAP_POPULATE: let MADV_HUGEPAGE take effect before page faults.
        let mmap_result = unsafe { MmapOptions::new().map(&file) };
        match mmap_result {
            Ok(mmap) => {
                #[cfg(target_os = "linux")]
                {
                    // HUGEPAGE first: must be set before any page faults occur.
                    // Reduces ~25,600 minor faults (4KB) to ~50 (2MB) for 100MB.
                    if len >= 2 * 1024 * 1024 {
                        let _ = mmap.advise(memmap2::Advice::HugePage);
                    }
                    // POPULATE_READ (Linux 5.14+): synchronously prefaults all pages
                    // using huge pages. advise() errors on older kernels — fall
                    // back to WILLNEED.
                    if len >= 4 * 1024 * 1024 {
                        if mmap.advise(memmap2::Advice::PopulateRead).is_err() {
                            let _ = mmap.advise(memmap2::Advice::WillNeed);
                        }
                    } else {
                        let _ = mmap.advise(memmap2::Advice::WillNeed);
                    }
                }
                return Ok(FileData::Mmap(mmap));
            }
            Err(_) => {
                // mmap failed — fall back to an exact-size read().
                let mut buf = vec![0u8; len as usize];
                let n = read_full(&mut &file, &mut buf)?;
                // Truncate in case the file shrank between fstat and read().
                buf.truncate(n);
                return Ok(FileData::Owned(buf));
            }
        }
    } else if !metadata.file_type().is_file() {
        // Non-regular file (pipe, FIFO, device, process substitution) — read from open fd.
        // Pipes report len=0 from stat(), so we must always try to read regardless of len.
        let mut buf = Vec::new();
        let mut reader = file;
        reader.read_to_end(&mut buf)?;
        Ok(FileData::Owned(buf))
    } else {
        // Empty regular file.
        Ok(FileData::Owned(Vec::new()))
    }
}
203
204/// Read a file always using read() syscall (no mmap).
205/// Faster than mmap for 10MB files: read() handles page faults in-kernel
206/// with batched PTE allocation (~0.5ms), while mmap triggers ~2560
207/// user-space minor faults (~1-2µs each = 2.5-5ms on CI runners).
208pub fn read_file_direct(path: &Path) -> io::Result<FileData> {
209    let file = open_noatime(path)?;
210    let metadata = file.metadata()?;
211    #[cfg(target_os = "linux")]
212    {
213        // Only apply fadvise for files large enough to benefit (>= 64KB)
214        if metadata.len() >= 65536 {
215            use std::os::unix::io::AsRawFd;
216            unsafe {
217                libc::posix_fadvise(file.as_raw_fd(), 0, 0, libc::POSIX_FADV_SEQUENTIAL);
218            }
219        }
220    }
221    let len = metadata.len();
222
223    if len > 0 && metadata.file_type().is_file() {
224        let mut buf = Vec::with_capacity(len as usize);
225        io::Read::read_to_end(&mut &file, &mut buf)?;
226        Ok(FileData::Owned(buf))
227    } else if !metadata.file_type().is_file() {
228        let mut buf = Vec::new();
229        let mut reader = file;
230        reader.read_to_end(&mut buf)?;
231        Ok(FileData::Owned(buf))
232    } else {
233        Ok(FileData::Owned(Vec::new()))
234    }
235}
236
/// Get file size without reading it (for byte-count-only optimization).
pub fn file_size(path: &Path) -> io::Result<u64> {
    fs::metadata(path).map(|m| m.len())
}
241
242/// Read all bytes from stdin into a Vec.
243/// On Linux, uses raw libc::read() to bypass Rust's StdinLock/BufReader overhead.
244/// Uses a direct read() loop into a pre-allocated buffer instead of read_to_end(),
245/// which avoids Vec's grow-and-probe pattern (extra read() calls and memcpy).
246/// Callers should enlarge the pipe buffer via fcntl(F_SETPIPE_SZ) before calling.
247/// Uses the full spare capacity for each read() to minimize syscalls.
248pub fn read_stdin() -> io::Result<Vec<u8>> {
249    #[cfg(target_os = "linux")]
250    return read_stdin_raw();
251
252    #[cfg(not(target_os = "linux"))]
253    read_stdin_generic()
254}
255
/// Raw libc::read() implementation for Linux — bypasses Rust's StdinLock
/// and BufReader layers entirely. StdinLock uses an internal 8KB BufReader
/// which adds an extra memcpy for every read; raw read() goes directly
/// from the kernel pipe buffer to our Vec.
///
/// Pre-allocates 16MB to cover most workloads without over-allocating.
/// For larger inputs, capacity grows by doubling (at minimum one extra
/// PREALLOC chunk). Each read() uses the full spare capacity to maximize
/// bytes per syscall.
///
/// Note: callers (ftac, ftr, fbase64) are expected to enlarge the pipe
/// buffer via fcntl(F_SETPIPE_SZ) before calling this function. We don't
/// do it here to avoid accidentally shrinking a previously enlarged pipe.
#[cfg(target_os = "linux")]
fn read_stdin_raw() -> io::Result<Vec<u8>> {
    const PREALLOC: usize = 16 * 1024 * 1024;

    let mut buf: Vec<u8> = Vec::with_capacity(PREALLOC);

    loop {
        let spare_cap = buf.capacity() - buf.len();
        if spare_cap < 1024 * 1024 {
            // Grow by doubling (or at least one PREALLOC chunk beyond the
            // current length) to minimize realloc count.
            let new_cap = (buf.capacity() * 2).max(buf.len() + PREALLOC);
            buf.reserve(new_cap - buf.capacity());
        }
        let spare_cap = buf.capacity() - buf.len();
        let start = buf.len();

        // SAFETY: we read into the uninitialized spare capacity via a raw
        // pointer (no &[u8] over uninit memory is created) and set_len only
        // by the number of bytes the kernel actually wrote.
        let ret = unsafe {
            libc::read(
                0,
                buf.as_mut_ptr().add(start) as *mut libc::c_void,
                spare_cap,
            )
        };
        if ret < 0 {
            let err = io::Error::last_os_error();
            // EINTR: signal arrived mid-read — just retry.
            if err.kind() == io::ErrorKind::Interrupted {
                continue;
            }
            return Err(err);
        }
        if ret == 0 {
            break; // EOF
        }
        unsafe { buf.set_len(start + ret as usize) };
    }

    Ok(buf)
}
308
/// Splice piped stdin to a memfd, then mmap for zero-copy access.
/// Uses splice(2) to move data from the stdin pipe directly into a memfd's
/// page cache (kernel→kernel, no userspace copy). Returns a mutable mmap.
/// Returns Ok(None) — never an error for recoverable conditions — if stdin is
/// not a pipe, or memfd_create/splice/mmap fails, so callers can fall back to
/// a plain read path.
///
/// For translate operations: caller can modify the mmap'd data in-place.
/// For filter operations (delete, cut): caller reads from the mmap.
#[cfg(target_os = "linux")]
pub fn splice_stdin_to_mmap() -> io::Result<Option<memmap2::MmapMut>> {
    use std::os::unix::io::FromRawFd;

    // Check if stdin (fd 0) is a pipe; anything else falls back.
    let mut stat: libc::stat = unsafe { std::mem::zeroed() };
    if unsafe { libc::fstat(0, &mut stat) } != 0 {
        return Ok(None);
    }
    if (stat.st_mode & libc::S_IFMT) != libc::S_IFIFO {
        return Ok(None);
    }

    // Create memfd for receiving spliced data.
    // Use raw syscall to avoid glibc version dependency (memfd_create added in glibc 2.27,
    // but the syscall works on any kernel >= 3.17). This fixes cross-compilation to
    // aarch64-unknown-linux-gnu with older sysroots.
    let memfd =
        unsafe { libc::syscall(libc::SYS_memfd_create, c"stdin_splice".as_ptr(), 0u32) as i32 };
    if memfd < 0 {
        return Ok(None); // memfd_create not supported, fallback
    }

    // Splice all data from stdin pipe to memfd (zero-copy: kernel moves pipe pages)
    let mut total: usize = 0;
    loop {
        let n = unsafe {
            libc::splice(
                0,
                std::ptr::null_mut(),
                memfd,
                std::ptr::null_mut(),
                // Splice up to 1GB at a time (kernel will limit to actual pipe data)
                1024 * 1024 * 1024,
                libc::SPLICE_F_MOVE,
            )
        };
        if n > 0 {
            total += n as usize;
        } else if n == 0 {
            break; // EOF
        } else {
            let err = io::Error::last_os_error();
            // EINTR: retry the splice; any other error aborts to the fallback.
            if err.kind() == io::ErrorKind::Interrupted {
                continue;
            }
            unsafe { libc::close(memfd) };
            return Ok(None); // splice failed, fallback to read
        }
    }

    if total == 0 {
        // Empty input: close the memfd and let the caller use its normal path.
        unsafe { libc::close(memfd) };
        return Ok(None);
    }

    // Pin the memfd length to exactly `total` before mapping.
    // NOTE(review): splice() advances the memfd's file offset, so its size
    // should already equal `total`; this ftruncate is defensive against the
    // mapping exposing bytes beyond the spliced data — confirm before removing.
    if unsafe { libc::ftruncate(memfd, total as libc::off_t) } != 0 {
        unsafe { libc::close(memfd) };
        return Ok(None);
    }

    // Wrap memfd in a File for memmap2 API, then mmap it.
    // MAP_SHARED allows in-place modification; populate prefaults pages.
    let file = unsafe { File::from_raw_fd(memfd) };
    let mmap = unsafe { MmapOptions::new().populate().map_mut(&file) };
    drop(file); // Close memfd fd (mmap stays valid, kernel holds reference)

    match mmap {
        Ok(mut mm) => {
            // Advise kernel for sequential access + hugepages.
            // Return values deliberately ignored: madvise hints are best-effort.
            unsafe {
                libc::madvise(
                    mm.as_mut_ptr() as *mut libc::c_void,
                    total,
                    libc::MADV_SEQUENTIAL,
                );
                if total >= 2 * 1024 * 1024 {
                    libc::madvise(
                        mm.as_mut_ptr() as *mut libc::c_void,
                        total,
                        libc::MADV_HUGEPAGE,
                    );
                }
            }
            Ok(Some(mm))
        }
        Err(_) => Ok(None),
    }
}
408
/// Generic read_stdin for non-Linux platforms.
/// Reads stdin to EOF through StdinLock into a pre-allocated Vec, growing
/// in PREALLOC chunks so typical workloads see at most one allocation.
///
/// Soundness fix: the previous version used `set_len` to expose the Vec's
/// uninitialized spare capacity as `&mut [u8]` before calling `read`, which
/// is undefined behavior (std's `Read::read` requires an initialized buffer).
/// We now zero-fill the spare capacity with `resize` first — this costs a
/// memset per iteration but is sound safe Rust.
#[cfg(not(target_os = "linux"))]
fn read_stdin_generic() -> io::Result<Vec<u8>> {
    const PREALLOC: usize = 16 * 1024 * 1024;
    const READ_BUF: usize = 4 * 1024 * 1024;

    let mut stdin = io::stdin().lock();
    let mut buf: Vec<u8> = Vec::with_capacity(PREALLOC);

    loop {
        // Keep at least READ_BUF of spare capacity so each read() can move
        // a large chunk per syscall.
        if buf.capacity() - buf.len() < READ_BUF {
            buf.reserve(PREALLOC);
        }

        let start = buf.len();
        // Zero-initialize the spare capacity so the slice handed to read()
        // is valid. Initialized length is rolled back via truncate below.
        buf.resize(buf.capacity(), 0);

        match stdin.read(&mut buf[start..]) {
            Ok(0) => {
                buf.truncate(start); // EOF: drop the unused zeroed tail
                break;
            }
            Ok(n) => {
                buf.truncate(start + n); // keep only the bytes actually read
            }
            Err(e) if e.kind() == io::ErrorKind::Interrupted => {
                buf.truncate(start); // EINTR: roll back and retry
                continue;
            }
            Err(e) => return Err(e),
        }
    }

    Ok(buf)
}
445
/// Read as many bytes as possible into `buf`, retrying on partial reads,
/// until the buffer is full or EOF is reached.
///
/// Returns the number of bytes read (== `buf.len()` unless EOF came first).
/// Avoids the probe-read overhead of read_to_end; regular-file reads usually
/// fill the whole buffer on the first iteration, so the loop runs once.
///
/// Fix: the previous version propagated `ErrorKind::Interrupted` (EINTR) as
/// an error when it occurred on the *first* read; interrupted reads are now
/// retried uniformly, matching the behavior of std's `read_exact`.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            // EOF before the buffer filled (short source or file shrank).
            Ok(0) => break,
            Ok(n) => total += n,
            // Signal arrived mid-read — retry without counting any bytes.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}