// coreutils_rs/common/io.rs
1use std::fs::{self, File};
2use std::io::{self, Read};
3use std::ops::Deref;
4use std::path::Path;
5
6#[cfg(target_os = "linux")]
7use std::sync::atomic::{AtomicBool, Ordering};
8
9use memmap2::{Mmap, MmapOptions};
10
/// Holds file data — either zero-copy mmap or an owned Vec.
/// Dereferences to `&[u8]` for transparent use.
pub enum FileData {
    /// Zero-copy, read-only memory mapping of a regular file.
    Mmap(Mmap),
    /// Heap-owned bytes — used for small files, non-regular files
    /// (pipes/devices), and as the fallback when mmap fails.
    Owned(Vec<u8>),
}
17
18impl Deref for FileData {
19 type Target = [u8];
20
21 fn deref(&self) -> &[u8] {
22 match self {
23 FileData::Mmap(m) => m,
24 FileData::Owned(v) => v,
25 }
26 }
27}
28
/// Threshold below which we use read() instead of mmap.
/// For files under 1MB, read() is faster since mmap has setup/teardown overhead
/// (page table creation for up to 256 pages, TLB flush on munmap) that exceeds
/// the zero-copy benefit.
/// Compared strictly (`len < MMAP_THRESHOLD`), so a file of exactly 1MB is mmapped.
const MMAP_THRESHOLD: u64 = 1024 * 1024;
34
/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
/// Relaxed ordering is sufficient: this is a monotonic true→false hint, and a
/// stale read only costs one extra open() attempt, never correctness.
#[cfg(target_os = "linux")]
static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);
39
40/// Open a file with O_NOATIME on Linux to avoid atime inode writes.
41/// Caches whether O_NOATIME works to avoid double-open on every file.
42#[cfg(target_os = "linux")]
43fn open_noatime(path: &Path) -> io::Result<File> {
44 use std::os::unix::fs::OpenOptionsExt;
45 if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
46 match fs::OpenOptions::new()
47 .read(true)
48 .custom_flags(libc::O_NOATIME)
49 .open(path)
50 {
51 Ok(f) => return Ok(f),
52 Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
53 // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
54 NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
55 }
56 Err(e) => return Err(e), // Real error, propagate
57 }
58 }
59 File::open(path)
60}
61
/// Non-Linux fallback: O_NOATIME does not exist, so this is a plain open.
#[cfg(not(target_os = "linux"))]
fn open_noatime(path: &Path) -> io::Result<File> {
    fs::File::open(path)
}
66
67/// Read a file with zero-copy mmap for large files or read() for small files.
68/// Opens once with O_NOATIME, uses fstat for metadata to save a syscall.
69pub fn read_file(path: &Path) -> io::Result<FileData> {
70 let file = open_noatime(path)?;
71 let metadata = file.metadata()?;
72 let len = metadata.len();
73
74 if len > 0 && metadata.file_type().is_file() {
75 // Small files: exact-size read from already-open fd.
76 // Uses read_full into pre-sized buffer instead of read_to_end,
77 // which avoids the grow-and-probe pattern (saves 1-2 extra read() syscalls).
78 if len < MMAP_THRESHOLD {
79 let mut buf = vec![0u8; len as usize];
80 let n = read_full(&mut &file, &mut buf)?;
81 buf.truncate(n);
82 return Ok(FileData::Owned(buf));
83 }
84
85 // SAFETY: Read-only mapping. No MAP_POPULATE — it synchronously faults
86 // all pages with 4KB before MADV_HUGEPAGE can take effect, causing ~25,600
87 // minor page faults for 100MB (~12.5ms overhead). Without it, HUGEPAGE hint
88 // is set first, then POPULATE_READ prefaults using 2MB pages (~50 faults).
89 match unsafe { MmapOptions::new().map(&file) } {
90 Ok(mmap) => {
91 #[cfg(target_os = "linux")]
92 {
93 // HUGEPAGE MUST come first: reduces 25,600 minor faults (4KB) to
94 // ~50 faults (2MB) for 100MB files. Saves ~12ms of page fault overhead.
95 if len >= 2 * 1024 * 1024 {
96 let _ = mmap.advise(memmap2::Advice::HugePage);
97 }
98 let _ = mmap.advise(memmap2::Advice::Sequential);
99 // POPULATE_READ (5.14+): prefault with huge pages. Fall back to WillNeed.
100 if len >= 4 * 1024 * 1024 {
101 if mmap.advise(memmap2::Advice::PopulateRead).is_err() {
102 let _ = mmap.advise(memmap2::Advice::WillNeed);
103 }
104 } else {
105 let _ = mmap.advise(memmap2::Advice::WillNeed);
106 }
107 }
108 Ok(FileData::Mmap(mmap))
109 }
110 Err(_) => {
111 // mmap failed — fall back to read
112 let mut buf = Vec::with_capacity(len as usize);
113 let mut reader = file;
114 reader.read_to_end(&mut buf)?;
115 Ok(FileData::Owned(buf))
116 }
117 }
118 } else if !metadata.file_type().is_file() {
119 // Non-regular file (pipe, FIFO, device, process substitution) — read from open fd.
120 // Pipes report len=0 from stat(), so we must always try to read regardless of len.
121 let mut buf = Vec::new();
122 let mut reader = file;
123 reader.read_to_end(&mut buf)?;
124 Ok(FileData::Owned(buf))
125 } else {
126 Ok(FileData::Owned(Vec::new()))
127 }
128}
129
130/// Read a file entirely into a mutable Vec.
131/// Uses exact-size allocation from fstat + single read() for efficiency.
132/// Preferred over mmap when the caller needs mutable access (e.g., in-place decode).
133pub fn read_file_vec(path: &Path) -> io::Result<Vec<u8>> {
134 let file = open_noatime(path)?;
135 let metadata = file.metadata()?;
136 let len = metadata.len() as usize;
137 if len == 0 {
138 return Ok(Vec::new());
139 }
140 let mut buf = vec![0u8; len];
141 let n = read_full(&mut &file, &mut buf)?;
142 buf.truncate(n);
143 Ok(buf)
144}
145
/// Read a file always using mmap, with optimal page fault strategy.
/// Used by tac for zero-copy output and parallel scanning.
///
/// Strategy: mmap WITHOUT MAP_POPULATE, then MADV_HUGEPAGE + MADV_POPULATE_READ.
/// MAP_POPULATE synchronously faults all pages with 4KB BEFORE MADV_HUGEPAGE
/// can take effect, causing ~25,600 minor faults for 100MB (~12.5ms overhead).
/// MADV_POPULATE_READ (Linux 5.14+) prefaults pages AFTER HUGEPAGE is set,
/// using 2MB huge pages (~50 faults = ~0.1ms). Falls back to WILLNEED on
/// older kernels.
pub fn read_file_mmap(path: &Path) -> io::Result<FileData> {
    let file = open_noatime(path)?;
    // fstat on the already-open fd — length is consistent with the fd we map.
    let metadata = file.metadata()?;
    let len = metadata.len();

    if len > 0 && metadata.file_type().is_file() {
        // No MAP_POPULATE: let MADV_HUGEPAGE take effect before page faults.
        let mmap_result = unsafe { MmapOptions::new().map(&file) };
        match mmap_result {
            Ok(mmap) => {
                #[cfg(target_os = "linux")]
                {
                    // HUGEPAGE first: must be set before any page faults occur.
                    // Reduces ~25,600 minor faults (4KB) to ~50 (2MB) for 100MB.
                    if len >= 2 * 1024 * 1024 {
                        let _ = mmap.advise(memmap2::Advice::HugePage);
                    }
                    // POPULATE_READ (Linux 5.14+): synchronously prefaults all pages
                    // using huge pages. Falls back to WILLNEED on older kernels.
                    // Advise results are ignored on purpose — these are hints only.
                    if len >= 4 * 1024 * 1024 {
                        if mmap.advise(memmap2::Advice::PopulateRead).is_err() {
                            let _ = mmap.advise(memmap2::Advice::WillNeed);
                        }
                    } else {
                        let _ = mmap.advise(memmap2::Advice::WillNeed);
                    }
                }
                return Ok(FileData::Mmap(mmap));
            }
            Err(_) => {
                // mmap failed — fall back to read into an exact-size buffer.
                let mut buf = vec![0u8; len as usize];
                let n = read_full(&mut &file, &mut buf)?;
                buf.truncate(n);
                return Ok(FileData::Owned(buf));
            }
        }
    } else if !metadata.file_type().is_file() {
        // Non-regular file (pipe, FIFO, device, process substitution) — read from open fd.
        // Pipes report len=0 from stat(), so we must always try to read regardless of len.
        let mut buf = Vec::new();
        let mut reader = file;
        reader.read_to_end(&mut buf)?;
        Ok(FileData::Owned(buf))
    } else {
        // Regular file with len == 0: nothing to map or read.
        Ok(FileData::Owned(Vec::new()))
    }
}
203
204/// Read a file always using read() syscall (no mmap).
205/// Faster than mmap for 10MB files: read() handles page faults in-kernel
206/// with batched PTE allocation (~0.5ms), while mmap triggers ~2560
207/// user-space minor faults (~1-2µs each = 2.5-5ms on CI runners).
208pub fn read_file_direct(path: &Path) -> io::Result<FileData> {
209 let file = open_noatime(path)?;
210 let metadata = file.metadata()?;
211 let len = metadata.len();
212
213 if len > 0 && metadata.file_type().is_file() {
214 let mut buf = vec![0u8; len as usize];
215 let n = read_full(&mut &file, &mut buf)?;
216 buf.truncate(n);
217 Ok(FileData::Owned(buf))
218 } else if !metadata.file_type().is_file() {
219 let mut buf = Vec::new();
220 let mut reader = file;
221 reader.read_to_end(&mut buf)?;
222 Ok(FileData::Owned(buf))
223 } else {
224 Ok(FileData::Owned(Vec::new()))
225 }
226}
227
/// Get file size without reading it (for byte-count-only optimization).
pub fn file_size(path: &Path) -> io::Result<u64> {
    let metadata = fs::metadata(path)?;
    Ok(metadata.len())
}
232
233/// Read all bytes from stdin into a Vec.
234/// On Linux, uses raw libc::read() to bypass Rust's StdinLock/BufReader overhead.
235/// Uses a direct read() loop into a pre-allocated buffer instead of read_to_end(),
236/// which avoids Vec's grow-and-probe pattern (extra read() calls and memcpy).
237/// Callers should enlarge the pipe buffer via fcntl(F_SETPIPE_SZ) before calling.
238/// Uses the full spare capacity for each read() to minimize syscalls.
239pub fn read_stdin() -> io::Result<Vec<u8>> {
240 #[cfg(target_os = "linux")]
241 return read_stdin_raw();
242
243 #[cfg(not(target_os = "linux"))]
244 read_stdin_generic()
245}
246
/// Raw libc::read() implementation for Linux — bypasses Rust's StdinLock
/// and BufReader layers entirely. StdinLock uses an internal 8KB BufReader
/// which adds an extra memcpy for every read; raw read() goes directly
/// from the kernel pipe buffer to our Vec.
///
/// Pre-allocates 16MB to cover most workloads (benchmark = 10MB) without
/// over-allocating. For inputs > 16MB, doubles capacity on demand.
/// Each read() uses the full spare capacity to maximize bytes per syscall.
///
/// Note: callers (ftac, ftr, fbase64) are expected to enlarge the pipe
/// buffer via fcntl(F_SETPIPE_SZ) before calling this function. We don't
/// do it here to avoid accidentally shrinking a previously enlarged pipe.
#[cfg(target_os = "linux")]
fn read_stdin_raw() -> io::Result<Vec<u8>> {
    const PREALLOC: usize = 16 * 1024 * 1024;

    let mut buf: Vec<u8> = Vec::with_capacity(PREALLOC);

    loop {
        let spare_cap = buf.capacity() - buf.len();
        if spare_cap < 1024 * 1024 {
            // Grow by doubling (or at least PREALLOC = 16MB past the current
            // length, whichever is larger) to minimize realloc count.
            let new_cap = (buf.capacity() * 2).max(buf.len() + PREALLOC);
            buf.reserve(new_cap - buf.capacity());
        }
        // Recompute: reserve() may have allocated more than requested.
        let spare_cap = buf.capacity() - buf.len();
        let start = buf.len();

        // SAFETY: we read into the uninitialized spare capacity and extend
        // set_len only by the number of bytes actually read.
        let ret = unsafe {
            libc::read(
                0,
                buf.as_mut_ptr().add(start) as *mut libc::c_void,
                spare_cap,
            )
        };
        if ret < 0 {
            let err = io::Error::last_os_error();
            // EINTR: a signal interrupted the syscall before any data moved — retry.
            if err.kind() == io::ErrorKind::Interrupted {
                continue;
            }
            return Err(err);
        }
        if ret == 0 {
            break; // EOF
        }
        unsafe { buf.set_len(start + ret as usize) };
    }

    Ok(buf)
}
299
/// Splice piped stdin to a memfd, then mmap for zero-copy access.
/// Uses splice(2) to move data from the stdin pipe directly into a memfd's
/// page cache (kernel→kernel, no userspace copy). Returns a mutable mmap.
/// Returns None if stdin is not a pipe or splice fails.
///
/// For translate operations: caller can modify the mmap'd data in-place.
/// For filter operations (delete, cut): caller reads from the mmap.
///
/// All failure modes return Ok(None) rather than Err so the caller can fall
/// back to the ordinary read path.
#[cfg(target_os = "linux")]
pub fn splice_stdin_to_mmap() -> io::Result<Option<memmap2::MmapMut>> {
    use std::os::unix::io::FromRawFd;

    // Check if stdin is a pipe — splice requires at least one pipe fd.
    let mut stat: libc::stat = unsafe { std::mem::zeroed() };
    if unsafe { libc::fstat(0, &mut stat) } != 0 {
        return Ok(None);
    }
    if (stat.st_mode & libc::S_IFMT) != libc::S_IFIFO {
        return Ok(None);
    }

    // Create memfd for receiving spliced data.
    // Use raw syscall to avoid glibc version dependency (memfd_create added in glibc 2.27,
    // but the syscall works on any kernel >= 3.17). This fixes cross-compilation to
    // aarch64-unknown-linux-gnu with older sysroots.
    let memfd =
        unsafe { libc::syscall(libc::SYS_memfd_create, c"stdin_splice".as_ptr(), 0u32) as i32 };
    if memfd < 0 {
        return Ok(None); // memfd_create not supported, fallback
    }

    // Splice all data from stdin pipe to memfd (zero-copy: kernel moves pipe pages)
    let mut total: usize = 0;
    loop {
        let n = unsafe {
            libc::splice(
                0,
                std::ptr::null_mut(),
                memfd,
                std::ptr::null_mut(),
                // Splice up to 1GB at a time (kernel will limit to actual pipe data)
                1024 * 1024 * 1024,
                libc::SPLICE_F_MOVE,
            )
        };
        if n > 0 {
            total += n as usize;
        } else if n == 0 {
            break; // EOF
        } else {
            let err = io::Error::last_os_error();
            // EINTR: retry the splice; nothing was transferred.
            if err.kind() == io::ErrorKind::Interrupted {
                continue;
            }
            unsafe { libc::close(memfd) };
            return Ok(None); // splice failed, fallback to read
        }
    }

    if total == 0 {
        unsafe { libc::close(memfd) };
        return Ok(None);
    }

    // Pin the memfd length to exactly `total` before mapping, so the mmap
    // below never exposes bytes past the spliced data.
    // NOTE(review): splice into a regular-file offset should already leave the
    // size at exactly `total`, making this a defensive no-op — confirm before
    // removing.
    if unsafe { libc::ftruncate(memfd, total as libc::off_t) } != 0 {
        unsafe { libc::close(memfd) };
        return Ok(None);
    }

    // Wrap memfd in a File for memmap2 API, then mmap it.
    // MAP_SHARED allows in-place modification; populate prefaults pages.
    let file = unsafe { File::from_raw_fd(memfd) };
    let mmap = unsafe { MmapOptions::new().populate().map_mut(&file) };
    drop(file); // Close memfd fd (mmap stays valid, kernel holds reference)

    match mmap {
        Ok(mut mm) => {
            // Advise kernel for sequential access + hugepages.
            // Return values deliberately ignored — madvise is a hint.
            unsafe {
                libc::madvise(
                    mm.as_mut_ptr() as *mut libc::c_void,
                    total,
                    libc::MADV_SEQUENTIAL,
                );
                if total >= 2 * 1024 * 1024 {
                    libc::madvise(
                        mm.as_mut_ptr() as *mut libc::c_void,
                        total,
                        libc::MADV_HUGEPAGE,
                    );
                }
            }
            Ok(Some(mm))
        }
        Err(_) => Ok(None),
    }
}
399
/// Generic read_stdin for non-Linux platforms.
///
/// Reads into a zero-initialized buffer that grows ahead of demand.
/// The previous implementation used `set_len` to expose uninitialized
/// spare capacity to `Read::read`, which is undefined behavior per the
/// `Read` contract (implementations are allowed to read from the buffer).
/// Here each region is zero-filled exactly once when the Vec grows, then
/// reused — sound, with a one-time memset cost per 16MB chunk.
#[cfg(not(target_os = "linux"))]
fn read_stdin_generic() -> io::Result<Vec<u8>> {
    const PREALLOC: usize = 16 * 1024 * 1024;
    const READ_BUF: usize = 4 * 1024 * 1024;

    let mut stdin = io::stdin().lock();
    // Zero-initialized up front; `filled` tracks how much is real data.
    let mut buf: Vec<u8> = vec![0u8; PREALLOC];
    let mut filled: usize = 0;

    loop {
        // Keep at least READ_BUF of writable space ahead of `filled`
        // so each read() can move a large chunk per syscall.
        if buf.len() - filled < READ_BUF {
            buf.resize(buf.len() + PREALLOC, 0);
        }
        match stdin.read(&mut buf[filled..]) {
            Ok(0) => break, // EOF
            Ok(n) => filled += n,
            // EINTR: no data transferred; retry.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }

    // Drop the unused zeroed tail.
    buf.truncate(filled);
    Ok(buf)
}
436
/// Read as many bytes as possible into `buf`, retrying on partial reads
/// and EINTR, stopping at EOF.
///
/// Returns the total number of bytes read: equal to `buf.len()` unless EOF
/// arrived first. Regular-file reads usually fill the buffer on the first
/// call, so the loop typically runs a single iteration. Avoids the
/// probe-read overhead of read_to_end.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            // EOF before the buffer filled — caller truncates to `total`.
            Ok(0) => break,
            Ok(n) => total += n,
            // EINTR can hit any read(), including the first — retry instead
            // of surfacing a spurious error. (The old fast path propagated
            // an Interrupted error from the very first read.)
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}