extern crate alloc;
use alloc::boxed::Box;
use alloc::collections::BTreeMap;
use alloc::vec;
use alloc::vec::Vec;
use core::sync::atomic::{AtomicU64, Ordering};
use crate::fscore::structs::{
BLKPTR_SIZE, Blkptr, DmuObjectType, DnodePhys, Hyperblock, LCPFS_MAGIC,
};
use crate::io::pipeline::Pipeline;
use crate::mgmt::format::LcpfsFormatter;
use crate::{FsError, FsResult};
/// Default logical block size for new objects (128 KiB).
pub const DMU_DEFAULT_BLOCK_SIZE: u64 = 128 * 1024;
#[deprecated(
since = "2026.1.100",
note = "use DMU_DEFAULT_BLOCK_SIZE or get_block_size() instead"
)]
pub const DMU_BLOCK_SIZE: u64 = DMU_DEFAULT_BLOCK_SIZE;
/// Smallest block size accepted by `set_default_block_size`.
pub const DMU_MIN_BLOCKSIZE: u64 = 512;
/// Largest block size accepted by `set_default_block_size` (16 MiB).
pub const DMU_MAX_BLOCKSIZE: u64 = 16 * 1024 * 1024;
// Process-wide configured block size; read via get_block_size().
static CONFIGURED_BLOCK_SIZE: AtomicU64 = AtomicU64::new(DMU_DEFAULT_BLOCK_SIZE);
/// Validates and installs a new global default block size.
///
/// The size must be a power of two within
/// [`DMU_MIN_BLOCKSIZE`, `DMU_MAX_BLOCKSIZE`]; otherwise a static error
/// message is returned and the configured value is left untouched.
pub fn set_default_block_size(size: u64) -> Result<(), &'static str> {
    match size {
        s if !s.is_power_of_two() => Err("Block size must be a power of 2"),
        s if s < DMU_MIN_BLOCKSIZE => Err("Block size must be at least 512 bytes"),
        s if s > DMU_MAX_BLOCKSIZE => Err("Block size must be at most 16 MiB"),
        s => {
            CONFIGURED_BLOCK_SIZE.store(s, Ordering::SeqCst);
            Ok(())
        }
    }
}
/// Returns the currently configured default block size.
pub fn get_block_size() -> u64 {
    CONFIGURED_BLOCK_SIZE.load(Ordering::SeqCst)
}
/// Maximum length of a single DMU read/write access (64 MiB).
pub const DMU_MAX_ACCESS: u64 = 64 * 1024 * 1024;
// Number of block pointers that fit in one 4 KiB indirect block.
const PTRS_PER_BLOCK: u64 = 4096 / BLKPTR_SIZE as u64;
/// Sentinel object id meaning "allocate a new object".
pub const DMU_NEW_OBJECT: u64 = u64::MAX;
/// Sentinel meaning "to the end of the object" in range operations.
pub const DMU_OBJECT_END: u64 = u64::MAX;
/// Object id reserved for the meta-dnode.
pub const DMU_META_DNODE_OBJECT: u64 = 0;
// NOTE(review): the two constants below are not referenced anywhere in this
// file — presumably reserved for future dnode packing / batched allocation.
const DNODES_PER_BLOCK: u64 = 8;
const OBJECT_ALLOC_CHUNK_SIZE: u64 = 128;
/// An in-memory data buffer attached to an object: a byte range plus
/// dirty-tracking state used at TXG sync time.
#[derive(Debug)]
pub struct DmuBuf {
    pub db_object: u64,
    pub db_offset: u64,
    pub db_size: u64,
    pub db_data: Vec<u8>,
    pub db_dirty: bool,
    pub db_dirty_txg: u64,
}
impl DmuBuf {
    /// Creates a zero-filled buffer of `size` bytes for `object` at `offset`.
    pub fn new(object: u64, offset: u64, size: u64) -> Self {
        Self::with_data(object, offset, vec![0u8; size as usize])
    }
    /// Wraps an existing byte vector; the buffer size is the vector length.
    pub fn with_data(object: u64, offset: u64, data: Vec<u8>) -> Self {
        Self {
            db_object: object,
            db_offset: offset,
            db_size: data.len() as u64,
            db_data: data,
            db_dirty: false,
            db_dirty_txg: 0,
        }
    }
    /// Flags the buffer as modified in transaction group `txg`.
    pub fn mark_dirty(&mut self, txg: u64) {
        self.db_dirty = true;
        self.db_dirty_txg = txg;
    }
}
/// Kind of reservation a transaction takes on an object.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TxHoldType {
    Write,
    Free,
    Zap,
    Bonus,
    SpaceMap,
}
/// One recorded hold: the target object, operation kind, affected byte
/// range, and the space reserved for the operation.
#[derive(Debug)]
pub struct DmuTxHold {
    pub txh_object: u64,
    pub txh_type: TxHoldType,
    pub txh_offset: u64,
    pub txh_length: u64,
    // Bytes reserved from free space for this hold (0 for frees).
    pub txh_space_towrite: u64,
}
/// Lifecycle states of a [`DmuTx`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TxState {
    Open,
    Assigned,
    Committed,
    Aborted,
}
/// A DMU transaction: accumulates holds while `Open`, is bound to a TXG by
/// `assign`, then consumed by `commit` or `abort`.
pub struct DmuTx {
    tx_objset_id: u64,
    tx_state: TxState,
    // TXG this transaction was assigned to; 0 until `assign` succeeds.
    tx_txg: u64,
    tx_holds: Vec<DmuTxHold>,
    // Sum of txh_space_towrite over all holds, checked against free space.
    tx_space_towrite: u64,
    tx_start_time: u64,
}
/// How `assign` should behave when the pool is busy or low on space.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TxWaitType {
    Wait,
    NoWait,
    NoThrottle,
}
impl DmuTx {
pub fn create(objset_id: u64) -> Self {
Self {
tx_objset_id: objset_id,
tx_state: TxState::Open,
tx_txg: 0,
tx_holds: Vec::new(),
tx_space_towrite: 0,
tx_start_time: crate::get_time(),
}
}
pub fn hold_write(&mut self, object: u64, offset: u64, length: u64) -> FsResult<()> {
if length > DMU_MAX_ACCESS {
return Err(FsError::InvalidArgument {
reason: "write length exceeds DMU_MAX_ACCESS",
});
}
let block_size = get_block_size();
let space_needed = length + (length / block_size + 1) * BLKPTR_SIZE as u64;
self.tx_holds.push(DmuTxHold {
txh_object: object,
txh_type: TxHoldType::Write,
txh_offset: offset,
txh_length: length,
txh_space_towrite: space_needed,
});
self.tx_space_towrite += space_needed;
Ok(())
}
pub fn hold_free(&mut self, object: u64, offset: u64, length: u64) -> FsResult<()> {
self.tx_holds.push(DmuTxHold {
txh_object: object,
txh_type: TxHoldType::Free,
txh_offset: offset,
txh_length: length,
txh_space_towrite: 0, });
Ok(())
}
pub fn hold_bonus(&mut self, object: u64) -> FsResult<()> {
self.tx_holds.push(DmuTxHold {
txh_object: object,
txh_type: TxHoldType::Bonus,
txh_offset: 0,
txh_length: 0,
txh_space_towrite: BLKPTR_SIZE as u64,
});
self.tx_space_towrite += BLKPTR_SIZE as u64;
Ok(())
}
pub fn hold_zap(&mut self, object: u64) -> FsResult<()> {
let space_needed = get_block_size();
self.tx_holds.push(DmuTxHold {
txh_object: object,
txh_type: TxHoldType::Zap,
txh_offset: 0,
txh_length: 0,
txh_space_towrite: space_needed,
});
self.tx_space_towrite += space_needed;
Ok(())
}
pub fn assign(&mut self, objset: &Objset, _wait: TxWaitType) -> FsResult<()> {
if self.tx_state != TxState::Open {
return Err(FsError::InvalidArgument {
reason: "transaction not in open state",
});
}
if self.tx_space_towrite > 0 {
let allocator = crate::util::alloc::ALLOCATOR.lock();
let available = allocator.total_free;
drop(allocator);
if self.tx_space_towrite > available {
return Err(FsError::DiskFull {
needed_bytes: self.tx_space_towrite,
});
}
}
self.tx_txg = objset.os_txg.load(Ordering::Acquire);
self.tx_state = TxState::Assigned;
Ok(())
}
pub fn commit(mut self) {
if self.tx_state == TxState::Assigned {
self.tx_state = TxState::Committed;
}
}
pub fn abort(mut self) {
if self.tx_state == TxState::Open {
self.tx_state = TxState::Aborted;
}
}
pub fn txg(&self) -> u64 {
self.tx_txg
}
pub fn is_assigned(&self) -> bool {
self.tx_state == TxState::Assigned
}
}
/// In-core dnode: the on-disk `DnodePhys` plus runtime bookkeeping
/// (hold count, dirty state, and any attached data buffers).
pub struct Dnode {
    pub dn_object: u64,
    pub dn_phys: DnodePhys,
    // Reference count taken via hold()/rele().
    pub dn_holds: AtomicU64,
    pub dn_dirty: bool,
    pub dn_dirty_txg: u64,
    // Cached data buffers keyed by offset.
    pub dn_dbufs: BTreeMap<u64, DmuBuf>,
}
impl Dnode {
    /// Wraps an on-disk dnode image in fresh in-core state.
    pub fn new(object: u64, phys: DnodePhys) -> Self {
        Self {
            dn_object: object,
            dn_phys: phys,
            dn_holds: AtomicU64::new(0),
            dn_dirty: false,
            dn_dirty_txg: 0,
            dn_dbufs: BTreeMap::new(),
        }
    }
    /// Builds a brand-new dnode of the given type with a single direct
    /// block pointer. The blocksize/bonus parameters are currently unused.
    pub fn allocate(
        object: u64,
        object_type: DmuObjectType,
        _blocksize: u64,
        _bonustype: DmuObjectType,
        _bonuslen: u8,
    ) -> Self {
        let mut phys = DnodePhys::zero();
        phys.object_type = object_type as u8;
        phys.nblkptr = 1;
        Self::new(object, phys)
    }
    /// Takes a reference on this dnode.
    pub fn hold(&self) {
        self.dn_holds.fetch_add(1, Ordering::AcqRel);
    }
    /// Drops a reference; returns the count remaining after the release.
    pub fn rele(&self) -> u64 {
        let previous = self.dn_holds.fetch_sub(1, Ordering::AcqRel);
        previous - 1
    }
    /// Marks the dnode dirty in transaction group `txg`.
    pub fn set_dirty(&mut self, txg: u64) {
        self.dn_dirty = true;
        self.dn_dirty_txg = txg;
    }
    /// Decodes the on-disk type byte; unknown values map to `None`.
    pub fn object_type(&self) -> DmuObjectType {
        match self.dn_phys.object_type {
            17 => DmuObjectType::Znode,
            19 => DmuObjectType::PlainFileContents,
            20 => DmuObjectType::DirectoryContents,
            _ => DmuObjectType::None,
        }
    }
}
/// An object set: the collection of dnodes rooted at `os_rootbp`, with
/// object-id allocation state and an optional per-set encryption key.
pub struct Objset {
    pub os_id: u64,
    // Root block pointer of the serialized meta-dnode, updated at sync.
    pub os_rootbp: Blkptr,
    // Currently open transaction group number.
    pub os_txg: AtomicU64,
    pub os_dnodes: BTreeMap<u64, Dnode>,
    // Next never-used object id.
    os_next_object: AtomicU64,
    // Recycled object ids, popped before os_next_object is advanced.
    os_free_objects: Vec<u64>,
    // NOTE(review): appears unused by the visible code — confirm.
    os_alloc_next: AtomicU64,
    pub os_readonly: bool,
    os_encryption_key: Option<crate::crypto::aesni::EncryptionKey>,
}
impl Objset {
/// Creates an empty, writable object set with TXG 1 and no encryption key.
pub fn new(id: u64) -> Self {
    Self {
        os_id: id,
        os_rootbp: Blkptr::zero(),
        os_txg: AtomicU64::new(1),
        os_dnodes: BTreeMap::new(),
        os_next_object: AtomicU64::new(1), os_free_objects: Vec::new(),
        os_alloc_next: AtomicU64::new(1),
        os_readonly: false,
        os_encryption_key: None,
    }
}
/// Installs the encryption key used by read/write/sync paths.
pub fn set_encryption_key(&mut self, key: crate::crypto::aesni::EncryptionKey) {
    self.os_encryption_key = Some(key);
}
/// Returns the installed encryption key, if any.
pub fn get_encryption_key(&self) -> Option<&crate::crypto::aesni::EncryptionKey> {
    self.os_encryption_key.as_ref()
}
/// Removes the encryption key (subsequent I/O uses the all-zero key).
pub fn clear_encryption_key(&mut self) {
    self.os_encryption_key = None;
}
/// Allocates a new object in this set under an assigned transaction,
/// preferring recycled ids over advancing the id counter. Returns the id.
pub fn object_alloc(
    &mut self,
    object_type: DmuObjectType,
    blocksize: u64,
    bonustype: DmuObjectType,
    bonuslen: u8,
    tx: &DmuTx,
) -> FsResult<u64> {
    if !tx.is_assigned() {
        return Err(FsError::InvalidArgument {
            reason: "transaction not assigned",
        });
    }
    // Reuse a freed id when available; otherwise take a fresh one.
    let object_id = match self.os_free_objects.pop() {
        Some(recycled) => recycled,
        None => self.os_next_object.fetch_add(1, Ordering::AcqRel),
    };
    let mut node = Dnode::allocate(object_id, object_type, blocksize, bonustype, bonuslen);
    node.set_dirty(tx.txg());
    self.os_dnodes.insert(object_id, node);
    Ok(object_id)
}
/// Claims a specific object id (e.g. during replay), creating its dnode.
///
/// Fails with `AlreadyExists` if the id is in use, and — consistent with
/// `object_alloc`/`object_free` — with `InvalidArgument` if the
/// transaction has not been assigned to a TXG. Advances the next-object
/// counter past the claimed id so future allocations cannot collide.
pub fn object_claim(
    &mut self,
    object: u64,
    object_type: DmuObjectType,
    blocksize: u64,
    bonustype: DmuObjectType,
    bonuslen: u8,
    tx: &DmuTx,
) -> FsResult<()> {
    // BUGFIX: previously this method skipped the assigned-tx check that
    // every other mutating entry point performs, allowing a dirty dnode
    // to be tagged with txg 0.
    if !tx.is_assigned() {
        return Err(FsError::InvalidArgument {
            reason: "transaction not assigned",
        });
    }
    if self.os_dnodes.contains_key(&object) {
        return Err(FsError::AlreadyExists);
    }
    let mut dnode = Dnode::allocate(object, object_type, blocksize, bonustype, bonuslen);
    dnode.set_dirty(tx.txg());
    self.os_dnodes.insert(object, dnode);
    // Keep the id allocator ahead of any explicitly claimed id.
    let next = self.os_next_object.load(Ordering::Acquire);
    if object >= next {
        self.os_next_object.store(object + 1, Ordering::Release);
    }
    Ok(())
}
/// Frees `object` under an assigned transaction: its type is reset to
/// `None`, it is marked dirty, and its id becomes available for reuse.
pub fn object_free(&mut self, object: u64, tx: &DmuTx) -> FsResult<()> {
    if !tx.is_assigned() {
        return Err(FsError::InvalidArgument {
            reason: "transaction not assigned",
        });
    }
    let Some(dnode) = self.os_dnodes.get_mut(&object) else {
        return Err(FsError::NotFound);
    };
    dnode.dn_phys.object_type = DmuObjectType::None as u8;
    dnode.set_dirty(tx.txg());
    self.os_free_objects.push(object);
    Ok(())
}
/// Looks up `object`, takes a hold on its dnode, and returns it mutably.
pub fn dnode_hold(&mut self, object: u64) -> FsResult<&mut Dnode> {
    match self.os_dnodes.get_mut(&object) {
        Some(dn) => {
            dn.hold();
            Ok(dn)
        }
        None => Err(FsError::NotFound),
    }
}
/// Releases one hold on `object`'s dnode; silently ignores unknown ids.
pub fn dnode_rele(&self, object: u64) {
    if let Some(dn) = self.os_dnodes.get(&object) {
        dn.rele();
    }
}
/// Returns summary metadata for `object`, or `NotFound` if absent.
///
/// NOTE(review): sizes reported here are based on the configurable
/// `get_block_size()`, while the actual read/write paths in this module
/// use a hard-coded 4096-byte block — confirm which one `doi_max_offset`
/// and `doi_data_block_size` should reflect.
pub fn object_info(&self, object: u64) -> FsResult<ObjectInfo> {
    if let Some(dnode) = self.os_dnodes.get(&object) {
        let block_size = get_block_size();
        Ok(ObjectInfo {
            doi_type: dnode.object_type(),
            // Bonus buffers are not tracked yet; report empty.
            doi_bonus_type: DmuObjectType::None,
            doi_bonus_size: 0,
            doi_indirection: dnode.dn_phys.indirection_levels,
            doi_data_block_size: block_size as u32,
            // Logical bytes used, expressed in 512-byte sectors (rounded down).
            doi_physical_blocks_512: (dnode.dn_phys.used_bytes / 512),
            doi_max_offset: dnode.dn_phys.max_blkid * block_size,
        })
    } else {
        Err(FsError::NotFound)
    }
}
/// Reads `length` bytes at `offset` from `object`, decrypting with the
/// set's key (or the all-zero key when no key is installed).
pub fn read(&mut self, object: u64, offset: u64, length: usize) -> FsResult<Vec<u8>> {
    let Some(dnode) = self.os_dnodes.get(&object) else {
        return Err(FsError::NotFound);
    };
    let key = match self.os_encryption_key.as_ref() {
        Some(k) => k.key,
        None => [0u8; 32],
    };
    Self::read_dnode_data_with_key(&dnode.dn_phys, offset, length, &key)
}
/// Writes `data` at `offset` into `object` under an assigned transaction,
/// encrypting with the set's key (or the all-zero key) and marking the
/// dnode dirty in the transaction's TXG.
pub fn write(&mut self, object: u64, offset: u64, data: &[u8], tx: &DmuTx) -> FsResult<()> {
    if !tx.is_assigned() {
        return Err(FsError::InvalidArgument {
            reason: "transaction not assigned",
        });
    }
    let txg = tx.txg();
    let key = match self.os_encryption_key.as_ref() {
        Some(k) => k.key,
        None => [0u8; 32],
    };
    let Some(dnode) = self.os_dnodes.get_mut(&object) else {
        return Err(FsError::NotFound);
    };
    Self::write_dnode_data_with_key(&mut dnode.dn_phys, offset, data, txg, &key)?;
    dnode.set_dirty(txg);
    Ok(())
}
/// Flushes all dirty dnodes for this object set.
///
/// NOTE(review): the `_txg` argument is ignored and device id 0 is always
/// passed to `txg_sync` — confirm this is intentional for multi-device
/// configurations.
pub fn sync(&mut self, _txg: u64) -> FsResult<()> {
    self.txg_sync(0)?;
    Ok(())
}
/// Convenience wrapper: read with the all-zero ("no encryption") key.
fn read_dnode_data(dnode: &DnodePhys, offset: u64, length: usize) -> FsResult<Vec<u8>> {
    Self::read_dnode_data_with_key(dnode, offset, length, &[0u8; 32])
}
/// Reads `length` bytes starting at byte `offset` from the dnode's block
/// tree, decrypting each 4 KiB block with `key`. Holes are returned as
/// zeros without touching the device.
fn read_dnode_data_with_key(
    dnode: &DnodePhys,
    offset: u64,
    length: usize,
    key: &[u8; 32],
) -> FsResult<Vec<u8>> {
    // Fixed on-disk leaf block size used by this module's I/O paths.
    const BLOCK_SIZE: u64 = 4096;
    if length == 0 {
        return Ok(Vec::new());
    }
    let start_block = offset / BLOCK_SIZE;
    let end_block = (offset + length as u64).div_ceil(BLOCK_SIZE);
    let mut result_buffer = Vec::with_capacity(length);
    for blk_idx in start_block..end_block {
        let l0_bp = Self::traverse_with_key(dnode, blk_idx, key)?;
        if l0_bp.is_hole() {
            // Sparse region: synthesize zeros.
            let zeros_needed =
                core::cmp::min(BLOCK_SIZE as usize, length - result_buffer.len());
            result_buffer.extend(core::iter::repeat_n(0u8, zeros_needed));
            continue;
        }
        let raw_block = Pipeline::read_block_auto_nonce(&l0_bp, key)?;
        // Only the first block may start mid-block.
        let block_offset = if blk_idx == start_block {
            offset % BLOCK_SIZE
        } else {
            0
        };
        // BUGFIX: use saturating_sub so a short (truncated) physical block
        // cannot underflow the subtraction (debug panic / release wrap).
        let copy_len = core::cmp::min(
            (raw_block.len() as u64).saturating_sub(block_offset),
            length as u64 - result_buffer.len() as u64,
        ) as usize;
        let start = block_offset as usize;
        let end = start + copy_len;
        if end <= raw_block.len() {
            result_buffer.extend_from_slice(&raw_block[start..end]);
        }
    }
    Ok(result_buffer)
}
/// Convenience wrapper: traverse with the all-zero ("no encryption") key.
fn traverse(dnode: &DnodePhys, logical_block_index: u64) -> FsResult<Blkptr> {
    Self::traverse_with_key(dnode, logical_block_index, &[0u8; 32])
}
/// Walks the indirect-block tree to find the level-0 block pointer for
/// `logical_block_index`. Returns a zeroed (hole) pointer for unmapped
/// ranges, and `Corruption` if an indirect block is too short.
fn traverse_with_key(
    dnode: &DnodePhys,
    logical_block_index: u64,
    key: &[u8; 32],
) -> FsResult<Blkptr> {
    let level = dnode.indirection_levels;
    // Level 0: block pointers live directly in the dnode.
    if level == 0 {
        if logical_block_index >= dnode.nblkptr as u64 {
            return Ok(Blkptr::zero());
        }
        return Ok(dnode.blkptr[logical_block_index as usize]);
    }
    // Each top-level pointer covers PTRS_PER_BLOCK^level leaf blocks.
    // (Previously levels 1-3 had hand-expanded arms identical to this
    // general formula; they have been collapsed.)
    let blocks_per_top_ptr = PTRS_PER_BLOCK.pow(level as u32);
    let top_bp_idx = (logical_block_index / blocks_per_top_ptr) as usize;
    let remaining_index = logical_block_index % blocks_per_top_ptr;
    if top_bp_idx >= dnode.nblkptr as usize {
        return Ok(Blkptr::zero());
    }
    let mut current_bp = dnode.blkptr[top_bp_idx];
    let mut current_index = remaining_index;
    let mut current_level = level;
    // Descend one indirection level per iteration until level 0.
    while current_level > 0 {
        if current_bp.is_hole() {
            return Ok(Blkptr::zero());
        }
        let indirect_data = Pipeline::read_block_auto_nonce(&current_bp, key)?;
        let divisor = PTRS_PER_BLOCK.pow((current_level - 1) as u32);
        let ptr_index = (current_index / divisor) as usize;
        current_index %= divisor;
        let ptr_offset = ptr_index * BLKPTR_SIZE;
        if ptr_offset + BLKPTR_SIZE > indirect_data.len() {
            return Err(FsError::Corruption {
                block: logical_block_index,
                details: "Indirect block truncated",
            });
        }
        let ptr_slice = &indirect_data[ptr_offset..ptr_offset + BLKPTR_SIZE];
        // SAFETY: the slice is exactly BLKPTR_SIZE bytes (checked above);
        // Blkptr is the plain-old-data on-disk layout, read unaligned.
        current_bp = unsafe { core::ptr::read_unaligned(ptr_slice.as_ptr() as *const Blkptr) };
        current_level -= 1;
    }
    Ok(current_bp)
}
/// Convenience wrapper: write with the all-zero ("no encryption") key.
fn write_dnode_data(dnode: &mut DnodePhys, offset: u64, data: &[u8], txg: u64) -> FsResult<()> {
    Self::write_dnode_data_with_key(dnode, offset, data, txg, &[0u8; 32])
}
/// Writes `data` at byte `offset` into the dnode's block tree using `key`,
/// growing the indirection tree as needed and performing read-modify-write
/// for partially overwritten blocks. Updates `used_bytes` and `max_blkid`.
fn write_dnode_data_with_key(
    dnode: &mut DnodePhys,
    offset: u64,
    data: &[u8],
    txg: u64,
    key: &[u8; 32],
) -> FsResult<()> {
    // Fixed on-disk leaf block size used by this module's I/O paths.
    const BLOCK_SIZE: u64 = 4096;
    if data.is_empty() {
        return Ok(());
    }
    let start_block = offset / BLOCK_SIZE;
    let end_block = (offset + data.len() as u64).div_ceil(BLOCK_SIZE);
    let max_block_idx = end_block - 1;
    // Ensure the tree is deep enough to address the last block touched.
    let required_level = Self::required_indirection_level(max_block_idx);
    if required_level > dnode.indirection_levels {
        Self::grow_indirection(dnode, required_level, txg)?;
    }
    let mut data_offset = 0usize;
    for blk_idx in start_block..end_block {
        // Only the first block may start mid-block.
        let block_offset = if blk_idx == start_block {
            (offset % BLOCK_SIZE) as usize
        } else {
            0
        };
        let copy_len =
            core::cmp::min(BLOCK_SIZE as usize - block_offset, data.len() - data_offset);
        let mut block_data = [0u8; 4096];
        // Partial overwrite: preserve existing bytes (read-modify-write).
        if block_offset > 0 || copy_len < BLOCK_SIZE as usize {
            match Self::traverse_with_key(dnode, blk_idx, key) {
                Ok(existing_bp) => {
                    if !existing_bp.is_hole() {
                        let existing_data = Pipeline::read_block_auto_nonce(&existing_bp, key)
                            .map_err(|e| {
                                crate::lcpfs_println!(
                                    "[ DMU ] ERROR: Read-modify-write failed - cannot read \
                                    existing block {} for partial write: {:?}",
                                    blk_idx,
                                    e
                                );
                                e
                            })?;
                        let copy_size = core::cmp::min(existing_data.len(), 4096);
                        block_data[..copy_size].copy_from_slice(&existing_data[..copy_size]);
                    }
                }
                Err(_) => {
                    // Traversal failure is treated as a hole: start from zeros.
                }
            }
        }
        block_data[block_offset..block_offset + copy_len]
            .copy_from_slice(&data[data_offset..data_offset + copy_len]);
        // Persist the (possibly merged) block and splice its new pointer
        // into the tree copy-on-write style.
        let new_bp = Pipeline::write_block_full(&block_data, key, txg)?;
        Self::update_block_pointer(dnode, blk_idx, new_bp, txg)?;
        data_offset += copy_len;
    }
    // Track logical-size high-water marks.
    let new_size = offset + data.len() as u64;
    if new_size > dnode.used_bytes {
        dnode.used_bytes = new_size;
    }
    if max_block_idx > dnode.max_blkid {
        dnode.max_blkid = max_block_idx;
    }
    Ok(())
}
/// Returns the smallest indirection depth able to address `block_idx`.
/// The dnode holds 3 direct pointers; each indirection level multiplies
/// the addressable range by the number of pointers per indirect block.
fn required_indirection_level(block_idx: u64) -> u8 {
    const PTRS_PER_INDIRECT: u64 = 4096 / BLKPTR_SIZE as u64;
    match block_idx {
        i if i < 3 => 0,
        i if i < 3 * PTRS_PER_INDIRECT => 1,
        i if i < 3 * PTRS_PER_INDIRECT * PTRS_PER_INDIRECT => 2,
        _ => 3,
    }
}
/// Raises the dnode's indirection depth to `target_level`, one level per
/// iteration, by pushing each populated top-level pointer down into a
/// freshly written indirect block whose first entry is the old pointer.
///
/// NOTE(review): writes here always use the all-zero key, unlike the data
/// paths that take the objset's encryption key — confirm indirect blocks
/// are meant to be written unencrypted/zero-keyed.
fn grow_indirection(dnode: &mut DnodePhys, target_level: u8, txg: u64) -> FsResult<()> {
    const BLOCK_SIZE: usize = 4096;
    let key = [0u8; 32];
    while dnode.indirection_levels < target_level {
        for i in 0..dnode.nblkptr as usize {
            let old_bp = dnode.blkptr[i];
            if old_bp.is_hole() {
                continue;
            }
            let mut indirect_block = [0u8; BLOCK_SIZE];
            // SAFETY: Blkptr is the plain-old-data on-disk layout; viewing
            // its BLKPTR_SIZE bytes serializes it into the indirect block.
            unsafe {
                let bp_bytes = core::slice::from_raw_parts(
                    &old_bp as *const Blkptr as *const u8,
                    BLKPTR_SIZE,
                );
                indirect_block[..BLKPTR_SIZE].copy_from_slice(bp_bytes);
            }
            let indirect_bp = Pipeline::write_block_full(&indirect_block, &key, txg)?;
            dnode.blkptr[i] = indirect_bp;
        }
        dnode.indirection_levels += 1;
    }
    Ok(())
}
/// Installs `new_bp` as the level-0 pointer for `block_idx`, rewriting the
/// whole indirect-block path copy-on-write style: read (or zero-fill) each
/// indirect block on the path, patch the child pointer, then write the
/// path back bottom-up so every parent references the freshly written
/// child. Fails with `DiskFull` when `block_idx` exceeds what the current
/// 3 top-level pointers can address.
///
/// NOTE(review): indirect blocks are read/written with the all-zero key
/// here — confirm this matches the on-disk format for encrypted sets.
fn update_block_pointer(
    dnode: &mut DnodePhys,
    block_idx: u64,
    new_bp: Blkptr,
    txg: u64,
) -> FsResult<()> {
    const BLOCK_SIZE: usize = 4096;
    const PTRS_PER_INDIRECT: u64 = (BLOCK_SIZE / BLKPTR_SIZE) as u64;
    let level = dnode.indirection_levels;
    let key = [0u8; 32];
    // Direct case: pointer slots live in the dnode itself (max 3).
    if level == 0 {
        if block_idx >= 3 {
            return Err(FsError::DiskFull {
                needed_bytes: block_idx * 4096,
            });
        }
        dnode.blkptr[block_idx as usize] = new_bp;
        if block_idx as u8 >= dnode.nblkptr {
            dnode.nblkptr = block_idx as u8 + 1;
        }
        return Ok(());
    }
    let blocks_per_top = PTRS_PER_INDIRECT.pow(level as u32);
    let top_idx = (block_idx / blocks_per_top) as usize;
    if top_idx >= 3 {
        return Err(FsError::DiskFull {
            needed_bytes: block_idx * 4096,
        });
    }
    // Pointer index to follow at each level, topmost first.
    let mut path_indices = Vec::new();
    let mut remaining = block_idx % blocks_per_top;
    for l in (1..=level).rev() {
        let divisor = PTRS_PER_INDIRECT.pow((l - 1) as u32);
        path_indices.push((remaining / divisor) as usize);
        remaining %= divisor;
    }
    // Descend, collecting a mutable in-memory copy of each indirect block
    // on the path (holes become zero-filled blocks).
    let mut indirect_blocks: Vec<Vec<u8>> = Vec::new();
    let mut current_bp = dnode.blkptr[top_idx];
    for depth in 0..level as usize {
        let block_data = if current_bp.is_hole() {
            vec![0u8; BLOCK_SIZE]
        } else {
            Pipeline::read_block_auto_nonce(&current_bp, &key)?
        };
        indirect_blocks.push(block_data);
        if depth < (level - 1) as usize {
            let idx = path_indices[depth];
            let ptr_offset = idx * BLKPTR_SIZE;
            if ptr_offset + BLKPTR_SIZE <= indirect_blocks[depth].len() {
                let ptr_slice = &indirect_blocks[depth][ptr_offset..ptr_offset + BLKPTR_SIZE];
                // SAFETY: slice is exactly BLKPTR_SIZE bytes; Blkptr is the
                // plain-old-data on-disk layout, read unaligned.
                current_bp =
                    unsafe { core::ptr::read_unaligned(ptr_slice.as_ptr() as *const Blkptr) };
            } else {
                current_bp = Blkptr::zero();
            }
        }
    }
    // Patch the new level-0 pointer into the leaf indirect block.
    let leaf_idx = path_indices[level as usize - 1];
    let ptr_offset = leaf_idx * BLKPTR_SIZE;
    // SAFETY: serializing the POD Blkptr into the leaf block; the target
    // range is ptr_offset..ptr_offset+BLKPTR_SIZE within a BLOCK_SIZE buffer.
    unsafe {
        let bp_bytes =
            core::slice::from_raw_parts(&new_bp as *const Blkptr as *const u8, BLKPTR_SIZE);
        let leaf = indirect_blocks.last_mut().unwrap();
        leaf[ptr_offset..ptr_offset + BLKPTR_SIZE].copy_from_slice(bp_bytes);
    }
    // Write the path back bottom-up: each parent gets the pointer of the
    // child block written in the previous iteration.
    let mut child_bp = Blkptr::zero();
    for depth in (0..level as usize).rev() {
        let mut block_data = indirect_blocks[depth].clone();
        if depth < (level - 1) as usize {
            let idx = path_indices[depth];
            let ptr_offset = idx * BLKPTR_SIZE;
            // SAFETY: same POD serialization as above.
            unsafe {
                let bp_bytes = core::slice::from_raw_parts(
                    &child_bp as *const Blkptr as *const u8,
                    BLKPTR_SIZE,
                );
                block_data[ptr_offset..ptr_offset + BLKPTR_SIZE].copy_from_slice(bp_bytes);
            }
        }
        let written_bp = Pipeline::write_block_full(&block_data, &key, txg)?;
        if depth == 0 {
            // Topmost: the new pointer goes into the dnode itself.
            dnode.blkptr[top_idx] = written_bp;
            if top_idx as u8 >= dnode.nblkptr {
                dnode.nblkptr = top_idx as u8 + 1;
            }
        } else {
            child_bp = written_bp;
        }
    }
    Ok(())
}
pub fn max_file_size(indirection_level: u8) -> u64 {
const BLOCK_SIZE: u64 = 4096;
match indirection_level {
0 => 3 * BLOCK_SIZE,
1 => 3 * PTRS_PER_BLOCK * BLOCK_SIZE,
2 => 3 * PTRS_PER_BLOCK * PTRS_PER_BLOCK * BLOCK_SIZE,
3 => 3 * PTRS_PER_BLOCK * PTRS_PER_BLOCK * PTRS_PER_BLOCK * BLOCK_SIZE,
_ => u64::MAX,
}
}
/// Syncs the current transaction group to device `dev_id`: writes every
/// dirty dnode, serializes all dnodes into a meta-dnode block, updates
/// `os_rootbp`, persists the hyperblock, and advances the open TXG.
/// Returns the TXG that was synced.
///
/// The whole sync fails (without advancing the TXG) if any dnode write,
/// the meta-dnode write, or the hyperblock write fails.
pub fn txg_sync(&mut self, dev_id: usize) -> FsResult<u64> {
    // (The parameter was previously named `_dev_id` despite being used.)
    let current_txg = self.os_txg.load(Ordering::Acquire);
    crate::lcpfs_println!(
        "[ TXG ] Syncing TXG {} ({} dnodes)",
        current_txg,
        self.os_dnodes.len()
    );
    // All-zero key doubles as "no encryption".
    let key = self
        .os_encryption_key
        .as_ref()
        .map(|k| k.key)
        .unwrap_or([0u8; 32]);
    // Phase 1: write out every dirty dnode, collecting per-object results.
    let mut write_results: Vec<(u64, Result<(), FsError>)> = Vec::new();
    for (object_id, dnode) in self.os_dnodes.iter() {
        if dnode.dn_dirty {
            // SAFETY: DnodePhys is the plain-old-data on-disk layout; we
            // view its bytes only for the duration of the write call.
            let dnode_bytes = unsafe {
                core::slice::from_raw_parts(
                    &dnode.dn_phys as *const DnodePhys as *const u8,
                    core::mem::size_of::<DnodePhys>(),
                )
            };
            let result = Pipeline::write_block(dnode_bytes, &key, current_txg).map(|_| ());
            write_results.push((*object_id, result));
        }
    }
    // Abort the sync if anything failed; dirty flags stay set for retry.
    let mut failed_objects: Vec<u64> = Vec::new();
    for (object_id, result) in &write_results {
        if result.is_err() {
            failed_objects.push(*object_id);
        }
    }
    if !failed_objects.is_empty() {
        crate::lcpfs_println!(
            "[ TXG ] FAILED: {} dnodes failed to sync: {:?}",
            failed_objects.len(),
            failed_objects
        );
        return Err(FsError::IoError {
            vdev: 0,
            reason: "txg_sync failed: one or more dnodes could not be written",
        });
    }
    // Phase 2: clear dirty state on everything that was written.
    let synced_count = write_results.len();
    for (object_id, _) in &write_results {
        if let Some(dnode) = self.os_dnodes.get_mut(object_id) {
            dnode.dn_dirty = false;
            dnode.dn_dirty_txg = 0;
        }
    }
    // Phase 3: serialize ALL dnodes (id + phys) into the meta-dnode block
    // and chain the new root through the hyperblock.
    if synced_count > 0 {
        let mut meta_block = Vec::new();
        let dnode_size = core::mem::size_of::<DnodePhys>();
        for (object_id, dnode) in self.os_dnodes.iter() {
            meta_block.extend_from_slice(&object_id.to_le_bytes());
            // SAFETY: same POD byte view as above.
            let dnode_bytes = unsafe {
                core::slice::from_raw_parts(
                    &dnode.dn_phys as *const DnodePhys as *const u8,
                    dnode_size,
                )
            };
            meta_block.extend_from_slice(dnode_bytes);
        }
        match Pipeline::write_block_full(&meta_block, &key, current_txg) {
            Ok(meta_bp) => {
                self.os_rootbp = meta_bp;
                crate::lcpfs_println!(
                    "[ TXG ] Updated rootbp to DVA {:x}",
                    meta_bp.dva[0].offset
                );
                let hb = Hyperblock {
                    magic: LCPFS_MAGIC,
                    version: 1,
                    txg: current_txg,
                    guid_sum: 0, timestamp: 0, rootbp: self.os_rootbp,
                };
                if let Err(e) = LcpfsFormatter::write_hyperblock(dev_id, &hb) {
                    crate::lcpfs_println!("[ TXG ] ERROR: Failed to write hyperblock: {}", e);
                    return Err(FsError::IoError {
                        vdev: dev_id,
                        reason: "Failed to write hyperblock after txg_sync",
                    });
                }
                crate::lcpfs_println!("[ TXG ] Hyperblock written for TXG {}", current_txg);
            }
            Err(e) => {
                crate::lcpfs_println!("[ TXG ] ERROR: Failed to write meta-dnode: {:?}", e);
                return Err(e);
            }
        }
    }
    // Phase 4: open the next transaction group.
    let next_txg = current_txg + 1;
    self.os_txg.store(next_txg, Ordering::Release);
    crate::lcpfs_println!(
        "[ TXG ] Synced {} dirty dnodes, advanced to TXG {}",
        synced_count,
        next_txg
    );
    Ok(current_txg)
}
}
/// Summary metadata about an object, as returned by `Objset::object_info`.
#[derive(Debug, Clone, Copy)]
pub struct ObjectInfo {
    pub doi_type: DmuObjectType,
    pub doi_bonus_type: DmuObjectType,
    pub doi_bonus_size: u32,
    // Indirection depth of the object's block tree.
    pub doi_indirection: u8,
    pub doi_data_block_size: u32,
    // Space used, expressed in 512-byte sectors.
    pub doi_physical_blocks_512: u64,
    pub doi_max_offset: u64,
}
/// Lifecycle states of a transaction group: open for new transactions,
/// quiescing (draining), syncing to disk, then committed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TxgState {
    Open,
    Quiescing,
    Syncing,
    Committed,
}
/// A single transaction group with its state and usage counters.
pub struct Txg {
    pub txg_id: u64,
    pub txg_state: TxgState,
    pub txg_tx_count: AtomicU64,
    pub txg_space_used: AtomicU64,
}
impl Txg {
    /// Creates transaction group `id` in the `Open` state with zeroed counters.
    pub fn new(id: u64) -> Self {
        Self {
            txg_id: id,
            txg_state: TxgState::Open,
            txg_tx_count: AtomicU64::new(0),
            txg_space_used: AtomicU64::new(0),
        }
    }
    /// True while the group still accepts new transactions.
    pub fn is_open(&self) -> bool {
        matches!(self.txg_state, TxgState::Open)
    }
    /// Transitions Open -> Quiescing (stop admitting transactions).
    pub fn quiesce(&mut self) {
        self.txg_state = TxgState::Quiescing;
    }
    /// Transitions to Syncing (writing to disk).
    pub fn sync_start(&mut self) {
        self.txg_state = TxgState::Syncing;
    }
    /// Transitions to Committed (durable on disk).
    pub fn commit(&mut self) {
        self.txg_state = TxgState::Committed;
    }
}
/// C-style convenience wrapper for [`DmuTx::create`].
pub fn dmu_tx_create(objset: &Objset) -> DmuTx {
    DmuTx::create(objset.os_id)
}
/// C-style convenience wrapper for [`DmuTx::hold_write`].
pub fn dmu_tx_hold_write(tx: &mut DmuTx, object: u64, offset: u64, length: u64) -> FsResult<()> {
    tx.hold_write(object, offset, length)
}
/// C-style convenience wrapper for [`DmuTx::hold_free`].
pub fn dmu_tx_hold_free(tx: &mut DmuTx, object: u64, offset: u64, length: u64) -> FsResult<()> {
    tx.hold_free(object, offset, length)
}
/// C-style convenience wrapper for [`DmuTx::assign`].
pub fn dmu_tx_assign(tx: &mut DmuTx, objset: &Objset, wait: TxWaitType) -> FsResult<()> {
    tx.assign(objset, wait)
}
/// C-style convenience wrapper for [`DmuTx::commit`] (consumes the tx).
pub fn dmu_tx_commit(tx: DmuTx) {
    tx.commit()
}
/// C-style convenience wrapper for [`DmuTx::abort`] (consumes the tx).
pub fn dmu_tx_abort(tx: DmuTx) {
    tx.abort()
}
/// C-style convenience wrapper for [`Objset::object_alloc`].
pub fn dmu_object_alloc(
    objset: &mut Objset,
    object_type: DmuObjectType,
    blocksize: u64,
    bonustype: DmuObjectType,
    bonuslen: u8,
    tx: &DmuTx,
) -> FsResult<u64> {
    objset.object_alloc(object_type, blocksize, bonustype, bonuslen, tx)
}
/// C-style convenience wrapper for [`Objset::object_free`].
pub fn dmu_object_free(objset: &mut Objset, object: u64, tx: &DmuTx) -> FsResult<()> {
    objset.object_free(object, tx)
}
/// C-style convenience wrapper for [`Objset::read`].
pub fn dmu_read(objset: &mut Objset, object: u64, offset: u64, length: usize) -> FsResult<Vec<u8>> {
    objset.read(object, offset, length)
}
/// C-style convenience wrapper for [`Objset::write`].
pub fn dmu_write(
    objset: &mut Objset,
    object: u64,
    offset: u64,
    data: &[u8],
    tx: &DmuTx,
) -> FsResult<()> {
    objset.write(object, offset, data, tx)
}
/// C-style convenience wrapper for [`Objset::object_info`].
pub fn dmu_object_info(objset: &Objset, object: u64) -> FsResult<ObjectInfo> {
    objset.object_info(object)
}
/// Thin legacy facade exposing the keyless dnode I/O helpers.
///
/// NOTE(review): `root_bp` is never read by the visible code — presumably
/// kept for callers elsewhere; confirm before removing.
pub struct ObjectSet {
    pub root_bp: Blkptr,
}
impl ObjectSet {
    /// Forwards to `Objset::read_dnode_data` (all-zero key).
    pub fn read_dnode_data(dnode: &DnodePhys, offset: u64, length: usize) -> FsResult<Vec<u8>> {
        Objset::read_dnode_data(dnode, offset, length)
    }
    /// Forwards to `Objset::write_dnode_data` (all-zero key).
    pub fn write_dnode_data(
        dnode: &mut DnodePhys,
        offset: u64,
        data: &[u8],
        txg: u64,
    ) -> FsResult<()> {
        Objset::write_dnode_data(dnode, offset, data, txg)
    }
    /// Forwards to `Objset::max_file_size`.
    pub fn max_file_size(indirection_level: u8) -> u64 {
        Objset::max_file_size(indirection_level)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // Boundary checks for the direct/indirect thresholds: 3 direct
    // pointers, then 3 * 32^level blocks per indirection level
    // (32 pointers per 4 KiB indirect block with 128-byte Blkptrs).
    #[test]
    fn test_required_indirection_level() {
        assert_eq!(Objset::required_indirection_level(0), 0);
        assert_eq!(Objset::required_indirection_level(1), 0);
        assert_eq!(Objset::required_indirection_level(2), 0);
        assert_eq!(Objset::required_indirection_level(3), 1);
        assert_eq!(Objset::required_indirection_level(32), 1);
        assert_eq!(Objset::required_indirection_level(95), 1);
        assert_eq!(Objset::required_indirection_level(96), 2);
        assert_eq!(Objset::required_indirection_level(1000), 2);
        assert_eq!(Objset::required_indirection_level(3071), 2);
        assert_eq!(Objset::required_indirection_level(3072), 3);
        assert_eq!(Objset::required_indirection_level(100000), 3);
    }
    // max_file_size must agree with the thresholds above.
    #[test]
    fn test_max_file_size() {
        assert_eq!(Objset::max_file_size(0), 3 * 4096);
        assert_eq!(Objset::max_file_size(1), 3 * 32 * 4096);
        assert_eq!(Objset::max_file_size(2), 3 * 32 * 32 * 4096);
        assert_eq!(Objset::max_file_size(3), 3 * 32 * 32 * 32 * 4096);
    }
    // A fresh transaction is open/unassigned and accumulates space on holds.
    #[test]
    fn test_transaction_lifecycle() {
        let objset = Objset::new(1);
        let mut tx = DmuTx::create(objset.os_id);
        assert!(!tx.is_assigned());
        assert_eq!(tx.txg(), 0);
        tx.hold_write(1, 0, 4096).unwrap();
        assert!(tx.tx_space_towrite > 0);
        assert!(!tx.is_assigned());
    }
    // Dnode::allocate produces a level-0 dnode with one block pointer.
    #[test]
    fn test_dnode_allocation() {
        let dnode = Dnode::allocate(
            1,
            DmuObjectType::PlainFileContents,
            4096,
            DmuObjectType::None,
            0,
        );
        assert_eq!(dnode.dn_object, 1);
        assert_eq!(dnode.object_type(), DmuObjectType::PlainFileContents);
        assert_eq!(dnode.dn_phys.indirection_levels, 0);
        assert_eq!(dnode.dn_phys.nblkptr, 1);
    }
    // First allocation in a fresh objset gets id 1 (id 0 is the meta-dnode).
    // The tx state is forced to Assigned directly (same-module access).
    #[test]
    fn test_objset_object_alloc() {
        let mut objset = Objset::new(1);
        let mut tx = DmuTx::create(objset.os_id);
        tx.hold_write(DMU_NEW_OBJECT, 0, 0).unwrap();
        tx.tx_state = TxState::Assigned;
        tx.tx_txg = 1;
        let obj_id = objset
            .object_alloc(
                DmuObjectType::PlainFileContents,
                4096,
                DmuObjectType::None,
                0,
                &tx,
            )
            .unwrap();
        assert_eq!(obj_id, 1); assert!(objset.os_dnodes.contains_key(&obj_id));
    }
    // object_info reflects the allocated type and zero indirection.
    #[test]
    fn test_object_info() {
        let mut objset = Objset::new(1);
        let mut tx = DmuTx::create(objset.os_id);
        tx.tx_state = TxState::Assigned;
        tx.tx_txg = 1;
        let obj_id = objset
            .object_alloc(
                DmuObjectType::PlainFileContents,
                4096,
                DmuObjectType::None,
                0,
                &tx,
            )
            .unwrap();
        let info = objset.object_info(obj_id).unwrap();
        assert_eq!(info.doi_type, DmuObjectType::PlainFileContents);
        assert_eq!(info.doi_indirection, 0);
    }
    // Full state machine walk: Open -> Quiescing -> Syncing -> Committed.
    #[test]
    fn test_txg_states() {
        let mut txg = Txg::new(1);
        assert!(txg.is_open());
        assert_eq!(txg.txg_state, TxgState::Open);
        txg.quiesce();
        assert_eq!(txg.txg_state, TxgState::Quiescing);
        assert!(!txg.is_open());
        txg.sync_start();
        assert_eq!(txg.txg_state, TxgState::Syncing);
        txg.commit();
        assert_eq!(txg.txg_state, TxgState::Committed);
    }
    // DmuBuf::new zero-fills and starts clean; mark_dirty records the txg.
    #[test]
    fn test_dmu_buf() {
        let mut buf = DmuBuf::new(1, 0, 4096);
        assert_eq!(buf.db_object, 1);
        assert_eq!(buf.db_offset, 0);
        assert_eq!(buf.db_size, 4096);
        assert!(!buf.db_dirty);
        buf.mark_dirty(1);
        assert!(buf.db_dirty);
        assert_eq!(buf.db_dirty_txg, 1);
    }
    // with_data adopts the vector and derives db_size from its length.
    #[test]
    fn test_dmu_buf_with_data() {
        let data = vec![1, 2, 3, 4, 5];
        let buf = DmuBuf::with_data(1, 0, data.clone());
        assert_eq!(buf.db_data, data);
        assert_eq!(buf.db_size, 5);
    }
    // rele() returns the count remaining AFTER the release.
    #[test]
    fn test_dnode_hold_rele() {
        let dnode = Dnode::new(1, DnodePhys::zero());
        assert_eq!(dnode.dn_holds.load(Ordering::Acquire), 0);
        dnode.hold();
        assert_eq!(dnode.dn_holds.load(Ordering::Acquire), 1);
        dnode.hold();
        assert_eq!(dnode.dn_holds.load(Ordering::Acquire), 2);
        let count = dnode.rele();
        assert_eq!(count, 1);
        assert_eq!(dnode.dn_holds.load(Ordering::Acquire), 1);
    }
    // Validation limits of the global block-size knob.
    // NOTE(review): this mutates process-global state; with the default
    // parallel test runner it could race any test that reads
    // get_block_size() — confirm tests are run single-threaded or isolate.
    #[test]
    fn test_block_size_config() {
        assert!(set_default_block_size(4096).is_ok());
        assert_eq!(get_block_size(), 4096);
        assert!(set_default_block_size(512).is_ok());
        assert_eq!(get_block_size(), 512);
        assert!(set_default_block_size(16 * 1024 * 1024).is_ok());
        assert_eq!(get_block_size(), 16 * 1024 * 1024);
        assert!(set_default_block_size(0).is_err());
        // not a power of 2, below minimum, above maximum:
        assert!(set_default_block_size(1000).is_err()); assert!(set_default_block_size(256).is_err()); assert!(set_default_block_size(32 * 1024 * 1024).is_err());
        let _ = set_default_block_size(DMU_DEFAULT_BLOCK_SIZE);
    }
}