#![cfg(target_os = "linux")]
use crate::{Error, Result};
use std::fs::{File, OpenOptions};
use std::os::unix::io::{AsRawFd, FromRawFd};
use std::path::Path;
/// Creates `path` exclusively for writing, optionally enabling `O_DIRECT`.
///
/// The file is created with `O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC` and mode
/// `0o600`, so the call fails if the path already exists.
///
/// `O_DIRECT` is deliberately *not* passed to `open(2)`. On Linux,
/// `open(O_CREAT | O_EXCL | O_DIRECT)` on a filesystem without direct-I/O
/// support can create the file and *then* fail with `EINVAL`; a retry with
/// `O_EXCL` would then fail with `EEXIST` and leave an empty file behind.
/// Instead the file is created buffered and `O_DIRECT` is switched on afterwards
/// with `fcntl(F_SETFL)`. If that step fails, the descriptor stays buffered.
///
/// # Returns
/// The opened file and a flag telling whether `O_DIRECT` is active on it.
///
/// # Errors
/// Propagates the error from `path_to_cstr`, or returns `Error::Io` with the
/// `errno` reported by `open(2)`.
pub(crate) fn open_write_new(path: &Path, use_direct: bool) -> Result<(File, bool)> {
    let path_cstr = path_to_cstr(path)?;
    let flags = libc::O_WRONLY | libc::O_CREAT | libc::O_EXCL | libc::O_CLOEXEC;
    // SAFETY: `path_cstr` is a valid NUL-terminated string that outlives the call.
    let fd = unsafe { libc::open(path_cstr.as_ptr(), flags, 0o600_i32) };
    if fd < 0 {
        return Err(Error::Io(std::io::Error::last_os_error()));
    }
    // SAFETY: `fd` was just returned by a successful `open(2)` and has no other owner.
    let file = unsafe { File::from_raw_fd(fd) };
    if !use_direct {
        return Ok((file, false));
    }
    // Best effort: any failure here simply leaves the descriptor in buffered mode.
    // SAFETY: `fd` is a live descriptor owned by `file`, which is still in scope.
    let direct = unsafe {
        let current = libc::fcntl(fd, libc::F_GETFL);
        current >= 0 && libc::fcntl(fd, libc::F_SETFL, current | libc::O_DIRECT) == 0
    };
    Ok((file, direct))
}
pub(crate) fn open_read(path: &Path, use_direct: bool) -> Result<(File, bool)> {
let path_cstr = path_to_cstr(path)?;
let mut flags = libc::O_RDONLY | libc::O_CLOEXEC;
if use_direct {
flags |= libc::O_DIRECT;
}
let fd = unsafe { libc::open(path_cstr.as_ptr(), flags, 0) };
if fd >= 0 {
return Ok((unsafe { File::from_raw_fd(fd) }, use_direct));
}
let err = std::io::Error::last_os_error();
if use_direct && err.raw_os_error() == Some(libc::EINVAL) {
let flags_no_direct = libc::O_RDONLY | libc::O_CLOEXEC;
let fd2 = unsafe { libc::open(path_cstr.as_ptr(), flags_no_direct, 0) };
if fd2 >= 0 {
return Ok((unsafe { File::from_raw_fd(fd2) }, false));
}
return Err(Error::Io(std::io::Error::last_os_error()));
}
Err(Error::Io(err))
}
pub(crate) fn open_append(path: &Path) -> Result<File> {
OpenOptions::new()
.append(true)
.create(true)
.open(path)
.map_err(Error::Io)
}
pub(crate) fn open_write_at(path: &Path) -> Result<File> {
OpenOptions::new()
.write(true)
.create(true)
.truncate(false)
.open(path)
.map_err(Error::Io)
}
pub(crate) fn write_all(file: &File, data: &[u8]) -> Result<()> {
let fd = file.as_raw_fd();
let mut written = 0usize;
while written < data.len() {
let n = unsafe {
libc::write(
fd,
data[written..].as_ptr().cast::<libc::c_void>(),
data.len() - written,
)
};
if n < 0 {
let err = std::io::Error::last_os_error();
if err.kind() == std::io::ErrorKind::Interrupted {
continue;
}
return Err(Error::Io(err));
}
written += n as usize;
}
Ok(())
}
/// Writes `data` through a sector-aligned staging buffer using `pwrite(2)`.
///
/// `data` is copied into an `AlignedBuf` whose length is `data.len()` rounded
/// up with `round_up(_, sector_size)`. The whole padded buffer is then written
/// starting at file offset 0, so more bytes than `data.len()` may reach the
/// file. Empty input is a no-op.
///
/// # Errors
/// Propagates the error from `AlignedBuf::new`, returns `Error::Io` with the
/// `errno` of a failed `pwrite(2)` (interrupted calls are retried), or an
/// error when `pwrite` reports zero bytes written.
pub(crate) fn write_all_direct(file: &File, data: &[u8], sector_size: u32) -> Result<()> {
    use super::{round_up, AlignedBuf};
    if data.is_empty() {
        return Ok(());
    }
    let sector = sector_size as usize;
    let padded_len = round_up(data.len(), sector);
    let mut staging = AlignedBuf::new(padded_len, sector)?;
    staging.as_mut_slice()[..data.len()].copy_from_slice(data);

    let raw_fd = file.as_raw_fd();
    let start = staging.as_slice().as_ptr();
    let mut done = 0usize;
    loop {
        if done >= padded_len {
            return Ok(());
        }
        // SAFETY: `done < padded_len`, so `start + done` stays inside the staging
        // buffer (assumed to span `padded_len` bytes, as requested from
        // `AlignedBuf::new`), which is alive for the whole call.
        let rc = unsafe {
            libc::pwrite(
                raw_fd,
                start.add(done).cast::<libc::c_void>(),
                padded_len - done,
                done as libc::off_t,
            )
        };
        match rc {
            n if n > 0 => done += n as usize,
            0 => {
                return Err(Error::Io(std::io::Error::other(
                    "pwrite returned 0 in write_all_direct (no progress)",
                )));
            }
            _ => {
                let err = std::io::Error::last_os_error();
                // A signal-interrupted call is simply retried.
                if err.kind() != std::io::ErrorKind::Interrupted {
                    return Err(Error::Io(err));
                }
            }
        }
    }
}
/// Writes all of `data` to `file` starting at byte `offset` using `pwrite(2)`.
///
/// Short writes are continued and signal-interrupted calls are retried.
///
/// # Errors
/// Returns `Error::Io` with `ErrorKind::InvalidInput` if the running offset
/// overflows `i64`, with `ErrorKind::WriteZero` if `pwrite` makes no progress,
/// or with the `errno` of a failed `pwrite(2)`.
pub(crate) fn write_at(file: &File, offset: u64, data: &[u8]) -> Result<()> {
    let raw_fd = file.as_raw_fd();
    let base = offset as i64;
    let mut done = 0usize;
    while done < data.len() {
        let pos = match base.checked_add(done as i64) {
            Some(p) => p,
            None => {
                return Err(Error::Io(std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "write_at: offset overflow",
                )));
            }
        };
        let pending = &data[done..];
        // SAFETY: `pending` is a live slice, valid for `pending.len()` readable bytes.
        let rc = unsafe {
            libc::pwrite(
                raw_fd,
                pending.as_ptr().cast::<libc::c_void>(),
                pending.len(),
                pos as libc::off_t,
            )
        };
        if rc > 0 {
            done += rc as usize;
            continue;
        }
        if rc == 0 {
            return Err(Error::Io(std::io::Error::new(
                std::io::ErrorKind::WriteZero,
                "pwrite returned 0 in write_at (no progress)",
            )));
        }
        let err = std::io::Error::last_os_error();
        // A signal-interrupted call is retried at the same offset.
        if err.kind() != std::io::ErrorKind::Interrupted {
            return Err(Error::Io(err));
        }
    }
    Ok(())
}
/// Positional write entry point for the direct-I/O path.
///
/// Currently a plain delegate to `write_at`: no alignment or padding is
/// applied here, so errors and retry behaviour are exactly those of `write_at`.
pub(crate) fn write_at_direct(file: &File, offset: u64, data: &[u8]) -> Result<()> {
write_at(file, offset, data)
}
pub(crate) fn read_all(file: &File) -> Result<Vec<u8>> {
use std::io::Read;
let mut buf = Vec::new();
let _ = (&*file).read_to_end(&mut buf).map_err(Error::Io)?;
Ok(buf)
}
/// Reads the first `file_size` bytes of `file` through a sector-aligned buffer
/// using `pread(2)`, starting at file offset 0.
///
/// The buffer is an `AlignedBuf` of `file_size` rounded up with
/// `round_up(_, sector_size)`. The loop stops as soon as `file_size` bytes have
/// arrived. Reading on until the padded length is reached would, after the
/// usual short read at end-of-file, issue another `pread` whose offset, buffer
/// address and length are no longer sector-aligned, which an `O_DIRECT`
/// descriptor may reject with `EINVAL`. Those extra bytes were trimmed away
/// anyway, so the returned data is unchanged.
///
/// # Returns
/// At most `file_size` bytes; fewer if end-of-file is reached earlier. A
/// `file_size` of 0 yields an empty vector without touching the file.
///
/// # Errors
/// Propagates the error from `AlignedBuf::new`, or returns `Error::Io` with
/// the `errno` of a failed `pread(2)` (interrupted calls are retried).
pub(crate) fn read_all_direct(file: &File, file_size: u64, sector_size: u32) -> Result<Vec<u8>> {
    use super::{round_up, AlignedBuf};
    if file_size == 0 {
        return Ok(Vec::new());
    }
    let ss = sector_size as usize;
    let wanted = file_size as usize;
    let aligned_len = round_up(wanted, ss);
    let mut buf = AlignedBuf::new(aligned_len, ss)?;
    let fd = file.as_raw_fd();
    let mut total = 0usize;
    while total < wanted {
        // SAFETY: the destination sub-slice is live and at least
        // `aligned_len - total` bytes long for the duration of the call.
        let n = unsafe {
            libc::pread(
                fd,
                buf.as_mut_slice()[total..]
                    .as_mut_ptr()
                    .cast::<libc::c_void>(),
                aligned_len - total,
                total as libc::off_t,
            )
        };
        if n < 0 {
            let err = std::io::Error::last_os_error();
            if err.kind() == std::io::ErrorKind::Interrupted {
                continue;
            }
            return Err(Error::Io(err));
        }
        if n == 0 {
            // End-of-file before `file_size` bytes were available.
            break;
        }
        total += n as usize;
    }
    // A single read may overshoot `wanted` up to the padded length; drop the excess.
    let trimmed = usize::min(total, wanted);
    Ok(buf.as_slice()[..trimmed].to_vec())
}
pub(crate) fn read_range(file: &File, offset: u64, len: usize) -> Result<Vec<u8>> {
let fd = file.as_raw_fd();
let mut buf = vec![0u8; len];
let mut total_read = 0usize;
while total_read < len {
let off = (offset as i64)
.checked_add(total_read as i64)
.ok_or_else(|| {
Error::Io(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
"read_range: offset overflow",
))
})?;
let n = unsafe {
libc::pread(
fd,
buf[total_read..].as_mut_ptr().cast::<libc::c_void>(),
len - total_read,
off as libc::off_t,
)
};
if n < 0 {
let err = std::io::Error::last_os_error();
if err.kind() == std::io::ErrorKind::Interrupted {
continue;
}
return Err(Error::Io(err));
}
if n == 0 {
buf.truncate(total_read);
break;
}
total_read += n as usize;
}
buf.truncate(total_read);
Ok(buf)
}
pub(crate) fn sync_data(file: &File) -> Result<()> {
let fd = file.as_raw_fd();
let ret = unsafe { libc::fdatasync(fd) };
if ret == 0 {
Ok(())
} else {
Err(Error::Io(std::io::Error::last_os_error()))
}
}
pub(crate) fn fcntl_set_rw_hint(file: &File, hint_ordinal: u8) -> Result<()> {
const F_SET_RW_HINT: libc::c_int = 1036;
let kernel_hint: u64 = (hint_ordinal as u64).saturating_add(2);
let fd = file.as_raw_fd();
let ret = unsafe { libc::fcntl(fd, F_SET_RW_HINT, &kernel_hint as *const u64) };
if ret == 0 {
Ok(())
} else {
Err(Error::Io(std::io::Error::last_os_error()))
}
}
pub(crate) fn sync_full(file: &File) -> Result<()> {
let fd = file.as_raw_fd();
let ret = unsafe { libc::fsync(fd) };
if ret == 0 {
Ok(())
} else {
Err(Error::Io(std::io::Error::last_os_error()))
}
}
/// Renames `from` to `to` via `std::fs::rename`, mapping failures to `Error::Io`.
///
/// NOTE(review): atomicity is whatever the underlying rename provides (e.g. both
/// paths on the same filesystem); nothing in this function enforces it.
pub(crate) fn atomic_rename(from: &Path, to: &Path) -> Result<()> {
std::fs::rename(from, to).map_err(Error::Io)
}
/// Calls `fsync(2)` on the directory that contains `path`.
///
/// The directory is `path.parent()`. When that is `None`, or the empty path
/// that `Path::parent` returns for a bare relative file name such as
/// `"data.bin"`, the current directory `"."` is used. Opening the empty path
/// would fail.
///
/// # Errors
/// Returns `Error::Io` if the directory cannot be opened or `fsync(2)` fails.
pub(crate) fn sync_parent_dir(path: &Path) -> Result<()> {
    let parent = match path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => dir,
        _ => Path::new("."),
    };
    let dir = File::open(parent).map_err(Error::Io)?;
    // SAFETY: the descriptor is borrowed from `dir`, which outlives the call.
    let ret = unsafe { libc::fsync(dir.as_raw_fd()) };
    if ret == 0 {
        Ok(())
    } else {
        Err(Error::Io(std::io::Error::last_os_error()))
    }
}
/// Copies `src` to `dst` via `std::fs::copy` and returns the value it reports
/// (the number of bytes copied), mapping failures to `Error::Io`.
pub(crate) fn copy_file(src: &Path, dst: &Path) -> Result<u64> {
std::fs::copy(src, dst).map_err(Error::Io)
}
/// Reserves storage for the byte range `[offset, offset + len)` of `file`.
///
/// The primary path is `fallocate(2)` with `FALLOC_FL_KEEP_SIZE`. If that
/// fails with `EOPNOTSUPP` or `ENOSYS`, or the environment variable
/// `FSYS_TEST_FORCE_POSIX_FALLOCATE` is set, `posix_fallocate(3)` is used
/// instead. A `len` of 0 is a no-op.
///
/// The errno values come from `libc` rather than numeric literals because the
/// numbers differ between Linux architectures.
///
/// # Errors
/// Returns `Error::Io` with the `errno` of `fallocate(2)` for any other
/// failure, or with the error number returned by `posix_fallocate(3)`.
pub(crate) fn preallocate(file: &File, offset: u64, len: u64) -> Result<()> {
    if len == 0 {
        return Ok(());
    }
    let fd = file.as_raw_fd();
    let off = offset as libc::off_t;
    let len_off = len as libc::off_t;
    if std::env::var_os("FSYS_TEST_FORCE_POSIX_FALLOCATE").is_none() {
        // SAFETY: plain syscall on a descriptor borrowed from `file`.
        let ret = unsafe { libc::fallocate(fd, libc::FALLOC_FL_KEEP_SIZE, off, len_off) };
        if ret == 0 {
            return Ok(());
        }
        let err = std::io::Error::last_os_error();
        let unsupported = matches!(
            err.raw_os_error(),
            Some(code) if code == libc::EOPNOTSUPP || code == libc::ENOSYS
        );
        if !unsupported {
            return Err(Error::Io(err));
        }
    }
    // `posix_fallocate` returns the error number directly instead of setting `errno`.
    // SAFETY: plain libc call on a descriptor borrowed from `file`.
    let ret = unsafe { libc::posix_fallocate(fd, off, len_off) };
    if ret == 0 {
        Ok(())
    } else {
        Err(Error::Io(std::io::Error::from_raw_os_error(ret)))
    }
}
pub(crate) fn advise(file: &File, offset: u64, len: u64, advice: crate::Advice) -> Result<()> {
let fd = file.as_raw_fd();
let raw_advice: i32 = match advice {
crate::Advice::Normal => libc::POSIX_FADV_NORMAL,
crate::Advice::Sequential => libc::POSIX_FADV_SEQUENTIAL,
crate::Advice::Random => libc::POSIX_FADV_RANDOM,
crate::Advice::WillNeed => libc::POSIX_FADV_WILLNEED,
crate::Advice::DontNeed => libc::POSIX_FADV_DONTNEED,
};
let ret =
unsafe { libc::posix_fadvise(fd, offset as libc::off_t, len as libc::off_t, raw_advice) };
if ret == 0 {
Ok(())
} else {
Err(Error::Io(std::io::Error::from_raw_os_error(ret)))
}
}
/// Derives a sector size for `path` from `statfs(2)`.
///
/// Returns the reported `f_bsize` when it lies in `512..=65536`. Every other
/// outcome (path not convertible to a C string, `statfs` failure, or a value
/// outside that range) yields the fallback of 512.
pub(crate) fn probe_sector_size(path: &Path) -> u32 {
    const FALLBACK: u32 = 512;
    let Ok(c_path) = path_to_cstr(path) else {
        return FALLBACK;
    };
    // SAFETY: `libc::statfs` is a plain C struct for which all-zero bytes are valid.
    let mut info: libc::statfs = unsafe { std::mem::zeroed() };
    // SAFETY: `c_path` is NUL-terminated and `info` is a valid, writable struct.
    let rc = unsafe { libc::statfs(c_path.as_ptr(), &mut info) };
    let usable = rc == 0 && info.f_bsize > 0;
    if !usable {
        return FALLBACK;
    }
    let block = info.f_bsize as u64;
    match block {
        512..=65536 => block as u32,
        _ => FALLBACK,
    }
}
/// Reports whether direct I/O may be attempted on this platform.
///
/// Always returns `true`; no runtime probing happens here.
pub(crate) fn probe_direct_io_available() -> bool {
true
}
fn path_to_cstr(path: &Path) -> Result<std::ffi::CString> {
use std::os::unix::ffi::OsStrExt;
std::ffi::CString::new(path.as_os_str().as_bytes()).map_err(|_| Error::InvalidPath {
path: path.to_owned(),
reason: "path contains an interior NUL byte".into(),
})
}
// Unit tests for the Linux file primitives. Each test works on its own
// uniquely named file under the system temp directory.
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write as _;
use std::sync::atomic::{AtomicU64, Ordering};
// Per-process counter that keeps temp-file names distinct between tests.
static COUNTER: AtomicU64 = AtomicU64::new(0);
// Builds a temp-dir path from the process id, the counter, and `suffix`.
fn tmp_path(suffix: &str) -> std::path::PathBuf {
let n = COUNTER.fetch_add(1, Ordering::Relaxed);
std::env::temp_dir().join(format!(
"fsys_linux_{}_{}_{}",
std::process::id(),
n,
suffix
))
}
// Guard that removes the wrapped path on drop; removal errors are ignored.
struct TmpFile(std::path::PathBuf);
impl Drop for TmpFile {
fn drop(&mut self) {
let _ = std::fs::remove_file(&self.0);
}
}
// A buffered `open_write_new` must leave a file behind on disk.
#[test]
fn test_open_write_new_creates_file() {
let path = tmp_path("open_write_new");
let _guard = TmpFile(path.clone());
let (f, _direct) = open_write_new(&path, false).expect("open_write_new");
drop(f);
assert!(path.exists());
}
// Exclusive creation must refuse a path that already exists.
#[test]
fn test_open_write_new_fails_if_already_exists() {
let path = tmp_path("owne_exists");
let _guard = TmpFile(path.clone());
std::fs::write(&path, b"existing").expect("create");
let result = open_write_new(&path, false);
assert!(result.is_err(), "must fail when file already exists");
}
// Bytes written with `write_all` come back unchanged from `read_all`.
#[test]
fn test_write_all_and_read_all_roundtrip() {
let path = tmp_path("write_read");
let _guard = TmpFile(path.clone());
let (f, _) = open_write_new(&path, false).expect("open");
write_all(&f, b"hello fsys").expect("write");
drop(f);
let (rf, _) = open_read(&path, false).expect("open read");
let data = read_all(&rf).expect("read");
assert_eq!(data, b"hello fsys");
}
// A positional write is visible to a positional read of the same range.
#[test]
fn test_write_at_and_read_range() {
let path = tmp_path("write_at");
let _guard = TmpFile(path.clone());
std::fs::write(&path, b"aaaaaaaaa").expect("create");
let f = open_write_at(&path).expect("open write at");
write_at(&f, 2, b"bbb").expect("write at");
drop(f);
let (rf, _) = open_read(&path, false).expect("open read");
let chunk = read_range(&rf, 2, 3).expect("read range");
assert_eq!(chunk, b"bbb");
}
// `sync_data` succeeds on a freshly written, still-open file.
#[test]
fn test_sync_data_succeeds_on_open_file() {
let path = tmp_path("sync_data");
let _guard = TmpFile(path.clone());
let (f, _) = open_write_new(&path, false).expect("open");
write_all(&f, b"data").expect("write");
sync_data(&f).expect("sync_data");
}
// `sync_full` succeeds on a freshly written, still-open file.
#[test]
fn test_sync_full_succeeds_on_open_file() {
let path = tmp_path("sync_full");
let _guard = TmpFile(path.clone());
let (f, _) = open_write_new(&path, false).expect("open");
write_all(&f, b"full sync test").expect("write");
sync_full(&f).expect("sync_full");
}
// Renaming over an existing destination replaces its content and removes the source.
#[test]
fn test_atomic_rename_replaces_destination() {
let src = tmp_path("rename_src");
let dst = tmp_path("rename_dst");
let _gs = TmpFile(src.clone());
let _gd = TmpFile(dst.clone());
std::fs::write(&src, b"new").expect("write src");
std::fs::write(&dst, b"old").expect("write dst");
atomic_rename(&src, &dst).expect("rename");
assert!(!src.exists());
assert_eq!(std::fs::read(&dst).expect("read dst"), b"new");
}
// `copy_file` reports the byte count and reproduces the source content.
#[test]
fn test_copy_file_produces_identical_content() {
let src = tmp_path("copy_src");
let dst = tmp_path("copy_dst");
let _gs = TmpFile(src.clone());
let _gd = TmpFile(dst.clone());
std::fs::write(&src, b"copy me").expect("write");
let bytes = copy_file(&src, &dst).expect("copy");
assert_eq!(bytes, 7);
assert_eq!(std::fs::read(&dst).expect("read"), b"copy me");
}
// The probe never reports less than its 512-byte fallback.
#[test]
fn test_probe_sector_size_returns_at_least_512() {
let dir = std::env::temp_dir();
let size = probe_sector_size(&dir);
assert!(size >= 512, "sector size {}", size);
}
// Two separate append sessions must concatenate, not overwrite.
#[test]
fn test_open_append_creates_and_appends() {
let path = tmp_path("append");
let _guard = TmpFile(path.clone());
{
let mut f = open_append(&path).expect("open append");
f.write_all(b"line1\n").expect("write");
}
{
let mut f = open_append(&path).expect("open append 2");
f.write_all(b"line2\n").expect("write");
}
let content = std::fs::read(&path).expect("read");
assert_eq!(content, b"line1\nline2\n");
}
}
/// One extent reported by the `FS_IOC_FIEMAP` ioctl, as returned by
/// `fiemap_extents`.
///
/// Each field is copied verbatim from the kernel extent record.
/// NOTE(review): offsets and length are presumably in bytes, per the kernel's
/// fiemap interface — confirm against `linux/fiemap.h`.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy)]
pub(crate) struct FiemapExtent {
/// Kernel `fe_logical`: position of the extent within the file.
pub logical: u64,
/// Kernel `fe_physical`: position of the extent on the underlying device.
pub physical: u64,
/// Kernel `fe_length`: size of the extent.
pub length: u64,
/// Kernel `fe_flags`: bitmask of `FIEMAP_EXTENT_*` values.
pub flags: u32,
}
// Bit values for `FiemapExtent::flags`. Names and values are meant to mirror
// the kernel's `FIEMAP_EXTENT_*` definitions (see `linux/fiemap.h`); the
// one-line meanings below are paraphrased from the kernel names — TODO confirm.

/// Marks the final extent of the file; `fiemap_extents` stops when it sees it.
#[allow(dead_code)]
pub(crate) const FIEMAP_EXTENT_LAST: u32 = 0x0000_0001;
/// Extent location is unknown. Treated as unusable for DSM.
#[allow(dead_code)]
pub(crate) const FIEMAP_EXTENT_UNKNOWN: u32 = 0x0000_0002;
/// Delayed-allocation extent. Treated as unusable for DSM.
#[allow(dead_code)]
pub(crate) const FIEMAP_EXTENT_DELALLOC: u32 = 0x0000_0004;
/// Extent data is encoded. Treated as unusable for DSM.
#[allow(dead_code)]
pub(crate) const FIEMAP_EXTENT_ENCODED: u32 = 0x0000_0008;
/// Extent data is encrypted. Treated as unusable for DSM.
#[allow(dead_code)]
pub(crate) const FIEMAP_EXTENT_DATA_ENCRYPTED: u32 = 0x0000_0080;
/// Extent is not block-aligned. Treated as unusable for DSM.
#[allow(dead_code)]
pub(crate) const FIEMAP_EXTENT_NOT_ALIGNED: u32 = 0x0000_0100;
/// Extent data is stored inline with metadata. Treated as unusable for DSM.
#[allow(dead_code)]
pub(crate) const FIEMAP_EXTENT_DATA_INLINE: u32 = 0x0000_0200;
/// Extent data is tail-packed. Treated as unusable for DSM.
#[allow(dead_code)]
pub(crate) const FIEMAP_EXTENT_DATA_TAIL: u32 = 0x0000_0400;
/// Extent is allocated but unwritten. Treated as unusable for DSM.
#[allow(dead_code)]
pub(crate) const FIEMAP_EXTENT_UNWRITTEN: u32 = 0x0000_0800;
/// Returns `true` when none of the disqualifying `FIEMAP_EXTENT_*` bits are
/// set in `flags`.
///
/// The disqualifying bits are UNKNOWN, DELALLOC, ENCODED, DATA_ENCRYPTED,
/// NOT_ALIGNED, DATA_INLINE, DATA_TAIL and UNWRITTEN. `FIEMAP_EXTENT_LAST`
/// and any bit not listed here do not affect the result.
#[allow(dead_code)]
#[inline]
pub(crate) fn fiemap_extent_is_usable_for_dsm(flags: u32) -> bool {
    const BLOCKERS: [u32; 8] = [
        FIEMAP_EXTENT_UNKNOWN,
        FIEMAP_EXTENT_DELALLOC,
        FIEMAP_EXTENT_ENCODED,
        FIEMAP_EXTENT_DATA_ENCRYPTED,
        FIEMAP_EXTENT_NOT_ALIGNED,
        FIEMAP_EXTENT_DATA_INLINE,
        FIEMAP_EXTENT_DATA_TAIL,
        FIEMAP_EXTENT_UNWRITTEN,
    ];
    BLOCKERS.iter().all(|&bit| flags & bit == 0)
}
/// Queries the extents of the open file `fd` covering `[start, start + length)`
/// with the `FS_IOC_FIEMAP` ioctl.
///
/// Up to `MAX_CALLS` (4) ioctl calls are issued, each asking for at most
/// `EXTENTS_PER_CALL` (64) extents. Iteration ends early when the range is
/// exhausted, the kernel maps no extents, an extent carries
/// `FIEMAP_EXTENT_LAST`, or a call makes no forward progress. If the range
/// holds more extents than the call budget allows, the result is silently
/// truncated.
///
/// The request buffer is a plain `Vec<u8>`, which only guarantees 1-byte
/// alignment. The header and extents are therefore accessed with
/// `write_unaligned` / `read_unaligned`, so soundness does not depend on how
/// the allocator happens to align the buffer.
///
/// # Errors
/// Returns `Error::Io` with the `errno` of a failed `ioctl(2)`.
#[allow(dead_code)]
pub(crate) fn fiemap_extents(
    fd: std::os::unix::io::RawFd,
    start: u64,
    length: u64,
) -> Result<Vec<FiemapExtent>> {
    /// Layout of one extent record in the ioctl buffer (56 bytes).
    #[repr(C)]
    #[derive(Default, Clone, Copy)]
    struct KernelFiemapExtent {
        fe_logical: u64,
        fe_physical: u64,
        fe_length: u64,
        fe_reserved64: [u64; 2],
        fe_flags: u32,
        fe_reserved: [u32; 3],
    }
    /// Layout of the request/response header at the start of the ioctl buffer (32 bytes).
    #[repr(C)]
    #[derive(Default)]
    struct KernelFiemapHeader {
        fm_start: u64,
        fm_length: u64,
        fm_flags: u32,
        fm_mapped_extents: u32,
        fm_extent_count: u32,
        fm_reserved: u32,
    }
    const FS_IOC_FIEMAP: libc::c_ulong = 0xc020_660b;
    const EXTENTS_PER_CALL: u32 = 64;
    const MAX_CALLS: usize = 4;

    // Buffer geometry is the same for every call.
    let header_size = std::mem::size_of::<KernelFiemapHeader>();
    let extent_size = std::mem::size_of::<KernelFiemapExtent>();
    let buf_size = header_size + (EXTENTS_PER_CALL as usize) * extent_size;

    let mut out: Vec<FiemapExtent> = Vec::new();
    let mut current_start = start;
    let mut remaining = length;
    for _call in 0..MAX_CALLS {
        if remaining == 0 {
            break;
        }
        let mut buf: Vec<u8> = vec![0u8; buf_size];
        // SAFETY: `buf` holds at least `header_size` writable bytes and
        // `write_unaligned` places no alignment requirement on the pointer.
        unsafe {
            std::ptr::write_unaligned(
                buf.as_mut_ptr().cast::<KernelFiemapHeader>(),
                KernelFiemapHeader {
                    fm_start: current_start,
                    fm_length: remaining,
                    fm_flags: 0,
                    fm_mapped_extents: 0,
                    fm_extent_count: EXTENTS_PER_CALL,
                    fm_reserved: 0,
                },
            );
        }
        // SAFETY: `buf` is a live allocation with room for the header plus the
        // `EXTENTS_PER_CALL` extent records announced in `fm_extent_count`.
        let rc = unsafe { libc::ioctl(fd, FS_IOC_FIEMAP, buf.as_mut_ptr() as *mut libc::c_void) };
        if rc < 0 {
            return Err(Error::Io(std::io::Error::last_os_error()));
        }
        // SAFETY: the first `header_size` bytes of `buf` are initialized and the
        // header consists solely of integers, so any bit pattern is valid.
        let mapped = unsafe {
            std::ptr::read_unaligned(buf.as_ptr().cast::<KernelFiemapHeader>()).fm_mapped_extents
        };
        if mapped == 0 {
            break;
        }
        let mut last_logical_end: u64 = current_start;
        let mut hit_last = false;
        // Never read more records than the buffer has room for, whatever the kernel reports.
        for i in 0..(mapped as usize).min(EXTENTS_PER_CALL as usize) {
            // SAFETY: `i < EXTENTS_PER_CALL`, so the record lies entirely within
            // `buf`; it is integer-only and read without alignment assumptions.
            let extent: KernelFiemapExtent = unsafe {
                let ext_ptr =
                    buf.as_ptr().add(header_size + i * extent_size) as *const KernelFiemapExtent;
                std::ptr::read_unaligned(ext_ptr)
            };
            last_logical_end = extent.fe_logical.saturating_add(extent.fe_length);
            if (extent.fe_flags & FIEMAP_EXTENT_LAST) != 0 {
                hit_last = true;
            }
            out.push(FiemapExtent {
                logical: extent.fe_logical,
                physical: extent.fe_physical,
                length: extent.fe_length,
                flags: extent.fe_flags,
            });
        }
        if hit_last {
            break;
        }
        // Guard against a response that does not move past the requested start.
        if last_logical_end <= current_start {
            break;
        }
        let advanced = last_logical_end - current_start;
        if advanced >= remaining {
            break;
        }
        current_start = last_logical_end;
        remaining -= advanced;
    }
    Ok(out)
}
pub(crate) fn punch_hole(file: &File, offset: u64, len: u64) -> Result<()> {
if len == 0 {
return Ok(());
}
let fd = file.as_raw_fd();
let mode = libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE;
let rc = unsafe { libc::fallocate(fd, mode, offset as i64, len as i64) };
if rc == 0 {
Ok(())
} else {
Err(Error::Io(std::io::Error::last_os_error()))
}
}
pub(crate) fn zero_range(file: &File, offset: u64, len: u64) -> Result<()> {
if len == 0 {
return Ok(());
}
let fd = file.as_raw_fd();
let mode = libc::FALLOC_FL_ZERO_RANGE | libc::FALLOC_FL_KEEP_SIZE;
let rc = unsafe { libc::fallocate(fd, mode, offset as i64, len as i64) };
if rc == 0 {
Ok(())
} else {
Err(Error::Io(std::io::Error::last_os_error()))
}
}