//! armdb 0.1.12
//!
//! Sharded bitcask key-value storage optimized for NVMe.
//!
//! See the crate-level documentation for usage details.
use std::fs::{File, OpenOptions};
use std::path::Path;

use crate::error::{DbError, DbResult};
use crate::io::aligned_buf::AlignedBuf;

#[cfg(target_os = "macos")]
unsafe extern "C" {
    // Raw libc binding used by `open_read` / `open_write` to set F_NOCACHE
    // on freshly opened fds. Declared variadic to match the C prototype.
    fn fcntl(fd: std::os::raw::c_int, cmd: std::os::raw::c_int, ...) -> std::os::raw::c_int;
}

/// Read `len` bytes from `file` at `offset`.
///
/// On Linux with O_DIRECT this uses aligned buffers.
/// On macOS/other platforms this uses standard pread.
///
/// # Errors
///
/// Returns an `UnexpectedEof` I/O error if the file ends before `len`
/// bytes past `offset` could be read; propagates other read errors.
pub fn pread_value(file: &File, offset: u64, len: usize) -> DbResult<Vec<u8>> {
    use std::os::unix::fs::FileExt;

    // For O_DIRECT we need aligned buffers and aligned reads: round the
    // start down and the length up to sector boundaries.
    let sector_size: u64 = 4096;
    let aligned_offset = offset & !(sector_size - 1);
    let diff = (offset - aligned_offset) as usize;
    let aligned_len = (diff + len + sector_size as usize - 1) & !(sector_size as usize - 1);

    let mut buf = AlignedBuf::zeroed(aligned_len);

    let mut total_read = 0;
    while total_read < diff + len {
        // O_DIRECT requires the file offset and the buffer offset to stay
        // sector-aligned, so after a partial read we resume from the last
        // sector boundary (re-reading the tail of that sector) rather than
        // from the raw byte count. This fixes the misaligned-retry case the
        // old TODO described.
        let resume = total_read & !(sector_size as usize - 1);
        let r = file.read_at(&mut buf[resume..], aligned_offset + resume as u64)?;
        if resume + r <= total_read {
            break; // EOF or no forward progress: the read cannot be satisfied
        }
        total_read = resume + r;
    }

    if total_read < diff + len {
        return Err(DbError::Io(std::io::Error::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to read full value",
        )));
    }

    // Strip the alignment padding and return exactly the requested bytes.
    let mut result = vec![0u8; len];
    result.copy_from_slice(&buf[diff..diff + len]);
    Ok(result)
}

/// Read a 4096-byte aligned block. Returns `(block, bytes_read)`.
///
/// Short reads (at end of file) are zero-padded; `bytes_read` reflects
/// actual data so callers can avoid caching partial blocks.
#[cfg(feature = "var-collections")]
pub fn pread_block(file: &File, block_offset: u64) -> DbResult<(AlignedBuf, usize)> {
    use std::os::unix::fs::FileExt;
    // A misaligned offset would violate O_DIRECT constraints and read a
    // window straddling two blocks; catch that in debug builds.
    debug_assert!(
        block_offset & 4095 == 0,
        "block_offset must be 4096-aligned"
    );

    let mut buf = AlignedBuf::zeroed(4096);
    let n = file.read_at(&mut buf, block_offset)?;
    Ok((buf, n))
}

/// Open a file for reading.
///
/// On macOS this sets `F_NOCACHE` as a best-effort bypass of the page
/// cache; failure to set the flag is deliberately ignored.
///
/// NOTE(review): the previous doc claimed O_DIRECT is used on Linux, but
/// no `O_DIRECT` custom flag is set here — confirm whether callers rely
/// on direct-I/O semantics for handles returned by this function.
pub fn open_read(path: &Path) -> DbResult<File> {
    let file = OpenOptions::new().read(true).open(path)?;

    #[cfg(target_os = "macos")]
    {
        use std::os::unix::io::AsRawFd;
        // F_NOCACHE = 48 on macOS; third argument 1 enables it.
        // SAFETY: `fcntl` is called with a valid fd owned by `file`.
        // Best-effort: the return value is intentionally ignored.
        unsafe {
            fcntl(file.as_raw_fd(), 48 /* F_NOCACHE */, 1);
        }
    }

    Ok(file)
}

/// Open a file for positional writes. Creates the file if it doesn't
/// exist; existing contents are preserved (`truncate(false)`). Note this
/// is NOT append mode — no `O_APPEND` is set; callers write at explicit
/// offsets via `pwrite_at`.
///
/// On macOS this sets `F_NOCACHE` as a best-effort bypass of the page
/// cache; failure to set the flag is deliberately ignored.
///
/// NOTE(review): the previous doc claimed O_DIRECT is used on Linux, but
/// no `O_DIRECT` custom flag is set here — confirm whether callers rely
/// on direct-I/O semantics for handles returned by this function.
pub fn open_write(path: &Path) -> DbResult<File> {
    let file = OpenOptions::new()
        .create(true)
        .read(true)
        .write(true)
        .truncate(false)
        .open(path)?;

    #[cfg(target_os = "macos")]
    {
        use std::os::unix::io::AsRawFd;
        // SAFETY: `fcntl` is called with a valid fd owned by `file`.
        // Best-effort: the return value is intentionally ignored.
        unsafe {
            fcntl(file.as_raw_fd(), 48 /* F_NOCACHE */, 1);
        }
    }

    Ok(file)
}

/// Write all of `data` to `file` at `offset` (positional write: the
/// file's cursor is neither used nor moved).
///
/// # Errors
///
/// Propagates any I/O error from the underlying `pwrite` calls;
/// `write_all_at` retries until all bytes are written or an error occurs.
pub fn pwrite_at(file: &File, data: &[u8], offset: u64) -> DbResult<()> {
    use std::os::unix::fs::FileExt;
    file.write_all_at(data, offset)?;
    Ok(())
}

/// Flush file data to stable storage.
///
/// Uses `sync_data` (fdatasync-style), which may skip metadata updates
/// that aren't required to read the data back.
pub fn fsync(file: &File) -> DbResult<()> {
    Ok(file.sync_data()?)
}

/// Read `len` bytes from an encrypted data file at `offset`, decrypting pages.
///
/// Same alignment logic as `pread_value`, plus per-page AES-256-GCM decryption
/// using tags from the corresponding `.tags` file.
///
/// # Errors
///
/// Returns an `UnexpectedEof` I/O error on a short file, and propagates
/// tag-read and decryption failures.
#[cfg(feature = "encryption")]
pub fn pread_value_encrypted(
    file: &File,
    tag_file: &crate::io::tags::TagFile,
    cipher: &crate::crypto::PageCipher,
    file_id: u32,
    offset: u64,
    len: usize,
) -> DbResult<Vec<u8>> {
    use std::os::unix::fs::FileExt;

    // Round the read window out to sector boundaries: O_DIRECT requires it,
    // and decryption operates on whole 4096-byte pages anyway.
    let sector_size: u64 = 4096;
    let aligned_offset = offset & !(sector_size - 1);
    let diff = (offset - aligned_offset) as usize;
    let aligned_len = (diff + len + sector_size as usize - 1) & !(sector_size as usize - 1);

    let mut buf = AlignedBuf::zeroed(aligned_len);

    let mut total_read = 0;
    while total_read < diff + len {
        // As in `pread_value`: resume partial reads from the previous sector
        // boundary so the buffer/file offsets stay aligned for O_DIRECT.
        let resume = total_read & !(sector_size as usize - 1);
        let r = file.read_at(&mut buf[resume..], aligned_offset + resume as u64)?;
        if resume + r <= total_read {
            break; // EOF or no forward progress
        }
        total_read = resume + r;
    }

    if total_read < diff + len {
        return Err(DbError::Io(std::io::Error::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to read full value",
        )));
    }

    // Decrypt each 4096-byte page in place. Page indices are file-relative
    // (aligned_offset / 4096), which is what the tag file is keyed by.
    let start_page = aligned_offset / sector_size;
    let num_pages = aligned_len / sector_size as usize;
    let tags = tag_file.read_tags(start_page, num_pages)?;
    #[allow(clippy::needless_range_loop)]
    for i in 0..num_pages {
        let page_start = i * sector_size as usize;
        let page = &mut buf[page_start..page_start + sector_size as usize];
        cipher.decrypt_page(file_id, start_page + i as u64, page, &tags[i])?;
    }

    // Strip the alignment padding and return exactly the requested bytes.
    let mut result = vec![0u8; len];
    result.copy_from_slice(&buf[diff..diff + len]);
    Ok(result)
}