// lcpfs 2026.1.102
//
// LCP File System - A ZFS-inspired copy-on-write filesystem for Rust
// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! # On-Disk Data Structures
//!
//! This module defines the fundamental on-disk structures used by LCPFS.
//! These structures are designed for direct serialization to/from disk
//! blocks and maintain compatibility with the ZFS on-disk format concepts.
//!
//! ## Key Structures
//!
//! - [`Blkptr`]: The atomic unit of storage - a self-verifying block pointer
//! - [`Dva`]: Device Virtual Address - location of data on a vdev
//! - [`Hyperblock`]: Root of the pool's block tree (equivalent to ZFS uberblock)
//! - [`DnodePhys`]: Physical representation of a DMU object
//! - [`VdevLabel`]: Metadata at the start/end of each device
//!
//! ## Memory Layout
//!
//! All structures use `#[repr(C)]` for predictable memory layout and direct
//! disk I/O. Field sizes and alignments match ZFS conventions.

// ═══════════════════════════════════════════════════════════════════════════════
// DMU OBJECT TYPES
// ═══════════════════════════════════════════════════════════════════════════════

/// Object types managed by the Data Management Unit (DMU).
///
/// Each object in an LCPFS pool has a type that determines how its data
/// is interpreted and managed.
///
/// The enum is `#[repr(u8)]` and every variant has an explicit
/// discriminant (0 through 24, contiguous), so a variant converts to its
/// one-byte value with `as u8`. [`DnodePhys::object_type`] stores the type
/// as a raw `u8` and refers to this enum for its meaning.
///
/// NOTE(review): the discriminants are presumably part of the on-disk
/// format - confirm before renumbering or reordering variants.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DmuObjectType {
    /// Unallocated or invalid object.
    None = 0,
    /// Directory of objects (root of object set).
    ObjectDirectory = 1,
    /// Array of object references.
    ObjectArray = 2,
    /// Packed name-value list.
    PackedNvList = 3,
    /// Size of packed nvlist.
    PackedNvListSize = 4,
    /// Block pointer object.
    BpObj = 5,
    /// Block pointer object header.
    BpObjHdr = 6,
    /// Space map header.
    SpaceMapHeader = 7,
    /// Space map data.
    SpaceMap = 8,
    /// Intent log.
    IntentLog = 9,
    /// Dnode (object metadata).
    Dnode = 10,
    /// Object set.
    ObjSet = 11,
    /// DSL directory.
    DslDir = 12,
    /// DSL directory child map.
    DslDirChildMap = 13,
    /// DSL dataset snapshot map.
    DslDsSnapMap = 14,
    /// DSL properties.
    DslProps = 15,
    /// DSL dataset.
    DslDataset = 16,
    /// ZPL znode (file/directory inode).
    Znode = 17,
    /// Old-style ACL.
    OldAcl = 18,
    /// Plain file contents.
    PlainFileContents = 19,
    /// Directory contents.
    DirectoryContents = 20,
    /// Master node for ZPL.
    MasterNode = 21,
    /// Set of unlinked objects.
    UnlinkedSet = 22,
    /// ZVOL data.
    Zvol = 23,
    /// ZVOL properties.
    ZvolProp = 24,
}

// ═══════════════════════════════════════════════════════════════════════════════
// CONSTANTS
// ═══════════════════════════════════════════════════════════════════════════════

/// Size of the vdev label in bytes (256 KiB).
///
/// Each device has four labels: two at the start and two at the end.
///
/// NOTE(review): the fields declared in [`VdevLabel`] add up to less than
/// this value (three byte arrays totalling 128 KiB plus 128 [`Hyperblock`]
/// structs that are each far smaller than [`UBERBLOCK_SIZE`]) - confirm
/// how the struct and this constant are meant to line up.
pub const VDEV_LABEL_SIZE: usize = 256 * 1024;

/// Size of a hyperblock slot in bytes (1 KiB).
///
/// Reduced from ZFS's 4 KiB to fit 128 slots in the 128 KiB ring buffer
/// (128 slots x 1 KiB = 128 KiB). The constant keeps the ZFS "uberblock"
/// name; the corresponding LCPFS structure is [`Hyperblock`].
pub const UBERBLOCK_SIZE: usize = 1024;

/// Size of a block pointer in bytes (128 bytes).
///
/// [`Blkptr::from_bytes`] requires its input slice to be exactly this long.
pub const BLKPTR_SIZE: usize = 128;

/// Minimum block size in bytes (512 bytes - sector size).
pub const SPA_MINBLOCKSIZE: usize = 512;

/// Maximum block size in bytes (128 KiB).
pub const SPA_MAXBLOCKSIZE: usize = 128 * 1024;

/// LCPFS magic number: `0x007CCFF5`
///
/// Used to identify valid hyperblocks and pool labels.
/// [`Hyperblock::new`] stores this value in the `magic` field.
pub const LCPFS_MAGIC: u64 = 0x007CCFF5;

// ═══════════════════════════════════════════════════════════════════════════════
// BLOCK POINTER (The Atom of Storage)
// ═══════════════════════════════════════════════════════════════════════════════

/// Block pointer - the fundamental unit of data reference in LCPFS.
///
/// A block pointer contains everything needed to locate, verify, and
/// interpret a block of data:
///
/// - **Location**: Up to 3 DVAs (Device Virtual Addresses) for redundancy
/// - **Verification**: 256-bit checksum of the block contents
/// - **Metadata**: Compression type, birth transaction, fill count
///
/// # Redundancy
///
/// The three DVA slots enable:
/// - Single copy (`DVA[0]` only)
/// - Mirrored (`DVA[0]` and `DVA[1]`)
/// - Triple-mirrored or RAID-Z (all three DVAs)
///
/// # Self-Verification
///
/// The stored checksum lets a reader verify the block contents. The
/// verification itself, and repair of corrupted blocks from redundant
/// copies, are not implemented in this section of the file.
///
/// # Layout
///
/// The struct is `#[repr(C)]`, so fields are laid out in declaration
/// order. On targets where `u64` is 8-byte aligned each [`Dva`] occupies
/// 16 bytes and the whole struct 112 bytes (3 x 16 + 4 x 8 + 4 x 8), which
/// is smaller than [`BLKPTR_SIZE`] (128).
///
/// NOTE(review): confirm whether the remaining on-disk bytes are
/// intentionally unused or whether fields are missing from the struct.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
pub struct Blkptr {
    /// Device Virtual Addresses (up to 3 copies for redundancy).
    pub dva: [Dva; 3],
    /// Flags and compression type.
    // NOTE(review): the bit assignment inside this word is not defined in
    // this section - confirm against the code that packs/unpacks it.
    pub flags_compression: u64,
    /// Reserved for future use.
    pub padding: u64,
    /// Transaction group when this block was written.
    ///
    /// A value of 0 is treated as "never written" by [`Blkptr::is_hole`].
    pub birth_txg: u64,
    /// Number of non-zero block pointers in this subtree.
    pub fill_count: u64,
    /// 256-bit checksum (BLAKE3 or SHA-256), stored as four 64-bit words.
    pub checksum: [u64; 4],
}

/// Device Virtual Address - location of data on a virtual device.
///
/// A DVA identifies a specific location within a vdev where block data
/// is stored. The combination of vdev ID and byte offset uniquely
/// identifies any block in the pool.
///
/// # Layout and ordering
///
/// The struct is `#[repr(C)]`: a `u32` followed by a `u64`. On targets
/// where `u64` is 8-byte aligned, 4 padding bytes sit between the two
/// fields and the struct is 16 bytes.
///
/// The derived `PartialOrd`/`Ord` compare fields in declaration order:
/// first `vdev`, then `offset`.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct Dva {
    /// Virtual device ID within the pool.
    pub vdev: u32,
    /// Byte offset within the vdev.
    pub offset: u64,
}

// ═══════════════════════════════════════════════════════════════════════════════
// HYPERBLOCK (Pool Root)
// ═══════════════════════════════════════════════════════════════════════════════

/// Hyperblock - the root of the pool's block tree.
///
/// Equivalent to ZFS's uberblock, the hyperblock is the entry point for
/// accessing all data in a pool. Multiple hyperblocks are stored in a
/// ring buffer to provide transaction history and crash recovery
/// (see [`VdevLabel::hyperblocks`]).
///
/// # Recovery
///
/// On pool import, LCPFS scans all hyperblocks and selects the one with:
/// 1. Valid magic number
/// 2. Highest transaction group number
/// 3. Valid checksum
///
/// This ensures recovery to the most recent consistent state.
///
/// NOTE(review): the import/selection logic is not part of this section,
/// and the struct has no checksum field of its own apart from the one
/// inside `rootbp` - confirm which checksum step 3 refers to.
///
/// # Layout
///
/// The struct is `#[repr(C)]`: five `u64` words followed by one
/// [`Blkptr`]. That is far smaller than [`UBERBLOCK_SIZE`] (1 KiB); the
/// struct itself carries no padding out to the slot size.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
pub struct Hyperblock {
    /// Magic number (`LCPFS_MAGIC`).
    pub magic: u64,
    /// On-disk format version.
    pub version: u64,
    /// Transaction group number.
    pub txg: u64,
    /// Sum of all vdev GUIDs (pool integrity check).
    pub guid_sum: u64,
    /// Timestamp when this hyperblock was written.
    // NOTE(review): unit and epoch are not established here - confirm.
    pub timestamp: u64,
    /// Block pointer to the MOS (Meta Object Set).
    pub rootbp: Blkptr,
}

// ═══════════════════════════════════════════════════════════════════════════════
// VDEV LABEL
// ═══════════════════════════════════════════════════════════════════════════════

/// Vdev label - metadata at the start and end of each device.
///
/// Each device has four labels (L0, L1 at start; L2, L3 at end) for
/// redundancy. The label contains:
///
/// - Boot block area (for bootable pools)
/// - Pool configuration (nvlist format)
/// - Ring buffer of hyperblocks
///
/// # Layout
///
/// The struct is `#[repr(C)]`, so the regions appear in declaration
/// order: 8 KiB + 8 KiB + 112 KiB of raw bytes, then 128 [`Hyperblock`]
/// entries. Unlike the other structures in this file it derives no
/// traits (no `Debug`, `Clone` or `Copy`).
///
/// NOTE(review): the declared fields total less than [`VDEV_LABEL_SIZE`]
/// because each array element is a bare [`Hyperblock`], not a
/// [`UBERBLOCK_SIZE`]-byte slot - confirm the intended on-disk layout.
#[repr(C)]
pub struct VdevLabel {
    /// Reserved space for boot code (8 KiB).
    pub blank_space: [u8; 8 * 1024],
    /// Boot header area (8 KiB).
    pub boot_header: [u8; 8 * 1024],
    /// Name-value pairs describing pool configuration (112 KiB).
    pub nv_pairs: [u8; 112 * 1024],
    /// Ring buffer of hyperblocks for transaction history (128 entries).
    pub hyperblocks: [Hyperblock; 128],
}

// ═══════════════════════════════════════════════════════════════════════════════
// IMPLEMENTATIONS
// ═══════════════════════════════════════════════════════════════════════════════

impl Hyperblock {
    /// Create a new hyperblock for the given transaction group.
    ///
    /// # Arguments
    ///
    /// * `txg` - Transaction group number
    pub fn new(txg: u64) -> Self {
        Self {
            magic: LCPFS_MAGIC,
            version: 1, // LCPFS v1.0
            txg,
            guid_sum: 0,
            timestamp: 0,
            rootbp: Blkptr::zero(),
        }
    }
}

impl Blkptr {
    /// Create a zeroed block pointer (represents a hole).
    ///
    /// Every field is zero: all three DVAs, the flags word, the reserved
    /// word, the birth transaction group, the fill count and the checksum.
    /// [`Blkptr::is_hole`] returns `true` for the result.
    pub fn zero() -> Self {
        Self {
            dva: [Dva { vdev: 0, offset: 0 }; 3],
            flags_compression: 0,
            padding: 0,
            birth_txg: 0,
            fill_count: 0,
            checksum: [0; 4],
        }
    }

    /// Check if this block pointer represents a hole (no data).
    ///
    /// Holes are sparse regions that read as zeros but don't consume
    /// disk space. A block is a hole if:
    /// - `birth_txg` is 0 (block was never written), OR
    /// - All DVAs are empty (`vdev == 0` AND `offset == 0` for all three)
    ///
    /// Note: We can't just check `offset == 0` because legitimate blocks
    /// could be allocated at disk offset 0.
    ///
    /// # Returns
    ///
    /// `true` if either condition above holds, `false` otherwise.
    pub fn is_hole(&self) -> bool {
        // Primary check: birth_txg == 0 means the block was never written.
        // Secondary check: no DVA slot names a location.
        self.birth_txg == 0
            || self
                .dva
                .iter()
                .all(|dva| dva.vdev == 0 && dva.offset == 0)
    }

    /// Deserialize a block pointer from raw bytes.
    ///
    /// # Arguments
    ///
    /// * `bytes` - Raw bytes (must be exactly `BLKPTR_SIZE` bytes)
    ///
    /// # Errors
    ///
    /// Returns `Err("Invalid block pointer size")` if the byte slice is
    /// not exactly `BLKPTR_SIZE` bytes long.
    ///
    /// # Implementation notes
    ///
    /// - The slice does not need any particular alignment; the value is
    ///   copied out with `read_unaligned`.
    /// - The bytes are reinterpreted using the in-memory `#[repr(C)]`
    ///   layout of `Blkptr`, so multi-byte fields are read in the host's
    ///   native byte order.
    /// - Only the first `size_of::<Blkptr>()` bytes are interpreted; any
    ///   remaining bytes up to `BLKPTR_SIZE` are ignored.
    pub fn from_bytes(bytes: &[u8]) -> Result<Self, &'static str> {
        // Compile-time guard for the unsafe read below: the struct must fit
        // inside the buffer length that the runtime check enforces. If a
        // future field addition grows `Blkptr` past `BLKPTR_SIZE`, the build
        // fails here instead of the read running past the end of `bytes`.
        const _: () = assert!(core::mem::size_of::<Blkptr>() <= BLKPTR_SIZE);

        if bytes.len() != BLKPTR_SIZE {
            return Err("Invalid block pointer size");
        }

        // SAFETY:
        // 1. `bytes` is a valid slice of exactly BLKPTR_SIZE bytes (checked
        //    above) and `size_of::<Blkptr>() <= BLKPTR_SIZE` (asserted at
        //    compile time above), so the whole read stays inside `bytes`.
        // 2. `read_unaligned` places no alignment requirement on the source
        //    pointer.
        // 3. Every field of `Blkptr` is an integer or an array/struct of
        //    integers, so any bit pattern is a valid value.
        // 4. The result is an owned copy; no reference into `bytes` escapes.
        //
        // VERIFICATION: TODO - Prove Blkptr layout matches ZFS on-disk format
        //
        // JUSTIFICATION:
        // Block pointers stored in indirect blocks and dnodes.
        // from_bytes() provides deserialization from disk buffers.
        Ok(unsafe { core::ptr::read_unaligned(bytes.as_ptr().cast::<Blkptr>()) })
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// DNODE PHYSICAL
// ═══════════════════════════════════════════════════════════════════════════════

/// Dnode - physical representation of a DMU object.
///
/// Every object in LCPFS (files, directories, ZAPs, etc.) is represented
/// by a dnode. The dnode contains:
///
/// - Object type and metadata
/// - Block pointers to object data
/// - Bonus buffer for small inline data
///
/// # Block Tree
///
/// For objects larger than 3 blocks, the `blkptr` array contains indirect
/// block pointers. The `indirection_levels` field indicates the tree depth.
///
/// # Layout
///
/// The struct is `#[repr(C)]`, so fields are laid out in declaration
/// order. The five one-byte header fields plus `pad` fill exactly 8 bytes,
/// which keeps the following `u64` fields naturally aligned without
/// compiler-inserted padding.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
pub struct DnodePhys {
    /// Object type (see [`DmuObjectType`]).
    ///
    /// Stored as a raw `u8` rather than the enum itself.
    pub object_type: u8,
    /// Number of indirection levels (0 = direct blocks).
    pub indirection_levels: u8,
    /// Number of block pointers in use (1-3).
    ///
    /// Note that [`DnodePhys::zero`] sets this to 0.
    pub nblkptr: u8,
    /// Object family (for grouping related objects).
    pub family: u8,
    /// Security type.
    pub sec_type: u8,
    /// Padding for alignment.
    pub pad: [u8; 3],
    /// Highest block ID allocated.
    pub max_blkid: u64,
    /// Physical bytes used by this object.
    pub used_bytes: u64,
    /// Reserved for future use.
    pub pad2: [u64; 4],
    /// Block pointers to object data (up to 3 direct/indirect).
    pub blkptr: [Blkptr; 3],
    /// Bonus buffer for type-specific inline data (64 bytes).
    pub bonus: [u8; 64],
}

impl DnodePhys {
    /// Create a zeroed dnode.
    pub fn zero() -> Self {
        Self {
            object_type: 0,
            indirection_levels: 0,
            nblkptr: 0,
            family: 0,
            sec_type: 0,
            pad: [0; 3],
            max_blkid: 0,
            used_bytes: 0,
            pad2: [0; 4],
            blkptr: [Blkptr::zero(); 3],
            bonus: [0; 64],
        }
    }
}