// farena 0.3.0
//
// A file-backed arena allocator using pread for memory byte storage
// Documentation
use std::fs::File;
use std::io::{Error, Result};
use std::os::unix::fs::FileExt;

use crate::Location;

/// Builds the `io::Error` reported when a read yields fewer bytes than requested.
///
/// * `expected` - number of bytes that were requested.
/// * `got` - number of bytes actually obtained.
/// * `offset` - file offset at which the read started.
fn make_short_read_error(expected: usize, got: usize, offset: u32) -> Error {
    let message = format!("short read: expected {expected} bytes, got {got} at offset {offset}");
    Error::other(message)
}

/// Read-phase arena that retrieves stored data via `pread(2)`.
///
/// Constructed from file handles returned by
/// [`FileArenaWriter::finish`](crate::FileArenaWriter::finish).
///
/// Every read is a positioned read ([`FileExt::read_at`]) issued through
/// `&self`: no seek is performed, so a `&FileArena` can be shared across
/// threads.
#[derive(Debug)]
pub struct FileArena {
    /// Backing files. A location's file index selects which entry is read.
    files: Vec<File>,
}

impl FileArena {
    /// Creates a new arena from a list of file handles.
    ///
    /// # Ordering Contract
    ///
    /// File at position `i` in `files` must have been produced by a
    /// [`FileArenaWriter`] created with `FileArenaWriter::new(i)`.
    /// Violating this contract causes silent data corruption at read time.
    ///
    /// # Errors
    ///
    /// Returns an error if `files` is empty.
    pub fn new(files: Vec<File>) -> Result<Self> {
        if files.is_empty() {
            return Err(Error::other("FileArena::new: files vector is empty"));
        }
        Ok(Self { files })
    }

    /// Fills `buf` completely with the bytes stored at `loc`.
    ///
    /// A single `read_at` call may legitimately return fewer bytes than
    /// requested, so the read is repeated at the advancing offset until the
    /// buffer is full. `ErrorKind::Interrupted` is retried. A read that
    /// returns zero bytes before the buffer is full means the file ended
    /// early and is reported as a short read.
    ///
    /// `FileExt::read_exact_at` is deliberately not used: it would replace
    /// the crate's short-read message (expected/got/offset) with a generic
    /// one.
    ///
    /// # Errors
    ///
    /// Returns an error if the file index of `loc` does not refer to a backing
    /// file, if the underlying read fails, or if the file ends before `buf`
    /// has been filled.
    fn read_exact_into(&self, loc: &Location, buf: &mut [u8]) -> Result<()> {
        let index = loc.file_index() as usize;
        let file = self.files.get(index).ok_or_else(|| {
            Error::other(format!(
                "FileArena: file index {index} out of range ({} backing files)",
                self.files.len()
            ))
        })?;

        let base = u64::from(loc.offset());
        let mut filled = 0usize;
        while filled < buf.len() {
            // `filled as u64` is lossless: usize is at most 64 bits wide.
            match file.read_at(&mut buf[filled..], base + filled as u64) {
                // End of file reached before the requested range was read.
                Ok(0) => return Err(make_short_read_error(buf.len(), filled, loc.offset())),
                Ok(n) => filled += n,
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
        }
        Ok(())
    }

    /// Reads stored bytes at the given location.
    ///
    /// An empty location yields an empty vector without touching any file.
    ///
    /// # Errors
    ///
    /// Returns an error if the file index of `loc` does not refer to a backing
    /// file, if the underlying `pread(2)` fails, or if the file ends before
    /// the expected number of bytes has been read.
    pub fn get(&self, loc: Location) -> Result<Vec<u8>> {
        if loc.is_empty() {
            return Ok(Vec::new());
        }
        let mut buf = vec![0u8; loc.len() as usize];
        self.read_exact_into(&loc, &mut buf)?;
        Ok(buf)
    }

    /// Reads stored bytes, appending into an existing buffer.
    ///
    /// On failure `out` is restored to the length it had on entry.
    ///
    /// # Errors
    ///
    /// Returns an error if the file index of `loc` does not refer to a backing
    /// file, if the underlying `pread(2)` fails, or if the file ends before
    /// the expected number of bytes has been read.
    pub fn get_into(&self, loc: Location, out: &mut Vec<u8>) -> Result<()> {
        if loc.is_empty() {
            return Ok(());
        }
        let start = out.len();
        // Zero-fill the tail so the slice handed to the read never covers
        // uninitialized memory.
        out.resize(start + loc.len() as usize, 0);

        let outcome = self.read_exact_into(&loc, &mut out[start..]);
        if outcome.is_err() {
            out.truncate(start);
        }
        outcome
    }

    /// Reads stored bytes, appending directly into an existing `String`.
    ///
    /// On failure `out` is restored to the contents it had on entry.
    ///
    /// # Errors
    ///
    /// Returns an `io::Error` if the bytes are not valid UTF-8, if the file
    /// index of `loc` does not refer to a backing file, if the underlying
    /// `pread(2)` fails, or if the file ends before the expected number of
    /// bytes has been read.
    pub fn get_str_into(&self, loc: Location, out: &mut String) -> Result<()> {
        if loc.is_empty() {
            return Ok(());
        }
        let start = out.len();

        // SAFETY: the existing prefix `[..start]` is never modified. The
        // appended tail starts as zero bytes (valid UTF-8) and is either
        // validated below or truncated away before this function returns, so
        // the `String` holds valid UTF-8 whenever the caller can observe it.
        let bytes = unsafe { out.as_mut_vec() };
        bytes.resize(start + loc.len() as usize, 0);

        if let Err(e) = self.read_exact_into(&loc, &mut bytes[start..]) {
            bytes.truncate(start);
            return Err(e);
        }

        if let Err(e) = std::str::from_utf8(&bytes[start..]) {
            bytes.truncate(start);
            return Err(Error::other(format!(
                "stored bytes are not valid UTF-8: {e}"
            )));
        }

        Ok(())
    }

    /// Reads stored bytes, appending directly into an existing `String` without
    /// UTF-8 validation.
    ///
    /// On failure `out` is restored to the contents it had on entry. Debug
    /// builds still assert that the appended bytes are valid UTF-8.
    ///
    /// # Safety
    ///
    /// The caller must guarantee that the bytes stored at `loc` are valid UTF-8.
    ///
    /// # Errors
    ///
    /// Returns an `io::Error` if the file index of `loc` does not refer to a
    /// backing file, if the underlying `pread(2)` fails, or if the file ends
    /// before the expected number of bytes has been read.
    pub unsafe fn get_str_into_unchecked(&self, loc: Location, out: &mut String) -> Result<()> {
        if loc.is_empty() {
            return Ok(());
        }
        let start = out.len();

        // SAFETY: the existing prefix `[..start]` is never modified. The
        // appended tail starts as zero bytes (valid UTF-8); after a successful
        // read it holds the stored bytes, which the caller guarantees are
        // valid UTF-8, and on any failure it is truncated away.
        let bytes = unsafe { out.as_mut_vec() };
        bytes.resize(start + loc.len() as usize, 0);

        if let Err(e) = self.read_exact_into(&loc, &mut bytes[start..]) {
            // Drop the partially written tail so no unvalidated bytes remain.
            bytes.truncate(start);
            return Err(e);
        }

        // The caller guarantees validity; debug builds double-check it.
        debug_assert!(
            std::str::from_utf8(&out.as_bytes()[start..]).is_ok(),
            "farena::get_str_into_unchecked: bytes at {loc:?} are not valid UTF-8"
        );
        Ok(())
    }

    /// Returns the number of backing files.
    #[must_use]
    pub fn file_count(&self) -> usize {
        self.files.len()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Compile-time check: `FileArena` must be both `Send` and `Sync`.
    const _: () = {
        const fn require_send_and_sync<T: Send + Sync>() {}
        require_send_and_sync::<FileArena>();
    };

    /// The short-read message reports the expected size, the actual size and
    /// the offset.
    #[test]
    fn short_read_error_format() {
        let rendered = make_short_read_error(100, 50, 1234).to_string();
        assert_eq!(
            rendered,
            "short read: expected 100 bytes, got 50 at offset 1234"
        );
    }
}