fstool 0.4.0

Build disk images and filesystems (ext2/3/4, MBR, GPT) from a directory tree and TOML spec, in the spirit of genext2fs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
//! APFS in-place read/write file handle.
//!
//! Backs [`crate::fs::Filesystem::open_file_rw`] on an APFS volume that
//! was opened via [`super::Apfs::open_writable`]. Scope:
//!
//! - **Existing-file edits only.** `open_file_rw(create = true)` is
//!   rejected — the handle assumes the catalog already contains the
//!   target file. Whole-file create / remove / rename remain
//!   unsupported.
//! - **Single-extent semantics.** On sync, the file's entire byte
//!   range is rewritten as a single new fresh extent. We don't try to
//!   patch existing extents in place because APFS extents are
//!   immutable units of the omap-resolved fs-tree; doing partial COW
//!   would require splitting + re-stitching the extent record set,
//!   which is deferred.
//! - **Checkpoint-bounded.** Each `sync()` consumes one xp_desc slot.
//!   The format-time xp_desc area has [`super::write::XP_DESC_BLOCKS`]
//!   slots; we use one on format and the remainder are spare. Once
//!   they're exhausted further syncs return `Unsupported`.
//!
//! ## Crash safety
//!
//! On `sync()` we COW the whole metadata stack:
//!
//! 1. Allocate fresh blocks past the previous bump high-water for the
//!    rewritten file extent.
//! 2. Allocate fresh blocks for new fs-tree leaves, the new volume
//!    omap, a new APSB, and a new container omap.
//! 3. Write the new NXSB into the next free xp_desc slot — and only
//!    then is the new checkpoint discoverable.
//!
//! A crash between steps 1–2 leaves the old NXSB pointing at the old
//! metadata stack, so the next open sees the previous checkpoint
//! intact. Step 3's NXSB write is the only commit point: it's a
//! single-block write, and the reader picks the highest-xid valid
//! NXSB in the xp_desc area, so a torn NXSB block (failed Fletcher-64)
//! falls back to the previous slot.

use std::io::{Read, Seek, SeekFrom, Write};

use crate::Result;
use crate::block::BlockDevice;
use crate::fs::FileHandle;

use super::Apfs;
use super::ApfsState;
use super::fstree::FsTreeCtx;
use super::jrec::{
    APFS_TYPE_DSTREAM_ID, APFS_TYPE_FILE_EXTENT, APFS_TYPE_INODE, INO_EXT_TYPE_DSTREAM,
    J_INODE_VAL_FIXED_SIZE, OBJ_ID_MASK, OBJ_TYPE_SHIFT,
};
use super::read_at_paddr;
use super::write;

/// Read/write handle for one APFS regular file.
///
/// The handle holds `&mut Apfs` + `&mut dyn BlockDevice` for its full
/// lifetime `'a`, so the filesystem and the device are exclusively
/// borrowed for as long as the handle is alive.
///
/// Every edit is staged in RAM: `Write::write` and `set_len` only touch
/// the in-memory `contents` buffer, and nothing is committed until
/// `sync()` runs and writes a new checkpoint. On `Drop` the handle does
/// **not** auto-sync — call `sync()` to commit. Dropping without sync
/// silently discards every pending byte.
pub struct ApfsFileHandle<'a> {
    /// Filesystem the file lives in; `open` only succeeds when it is in
    /// write state.
    fs: &'a mut Apfs,
    /// Block device backing `fs`; reads at open time and the checkpoint
    /// written by `sync()` both go through it.
    dev: &'a mut dyn BlockDevice,
    /// Target inode object id (file_id from the drec record we resolved).
    target_oid: u64,
    /// In-memory image of the file's current bytes. Populated from the
    /// existing extents at open time; mutated by `Write::write` and
    /// `set_len`.
    contents: Vec<u8>,
    /// Read / write cursor (byte offset into `contents`). May point past
    /// the end of `contents` after a seek; reads there return 0 bytes
    /// and writes zero-fill the gap.
    pos: u64,
    /// True once any mutation has happened since the last sync. Cleared
    /// on `sync()`.
    dirty: bool,
}

impl<'a> ApfsFileHandle<'a> {
    /// Resolve `path`, load the file's current bytes into RAM, and
    /// return a handle ready for reads/writes.
    pub(super) fn open(
        fs: &'a mut Apfs,
        dev: &'a mut dyn BlockDevice,
        path: &str,
        flags: crate::fs::OpenFlags,
    ) -> Result<Self> {
        // Refuse if not in write mode.
        match &fs.state {
            ApfsState::Write(_) => {}
            ApfsState::Read(_) => {
                return Err(crate::Error::Unsupported(
                    "apfs: open_file_rw requires Apfs::open_writable (not Apfs::open)".into(),
                ));
            }
            ApfsState::PendingWrite(_) => {
                return Err(crate::Error::Unsupported(
                    "apfs: open_file_rw is not available in pending-write mode".into(),
                ));
            }
        }

        // Resolve path → oid via fs's reader API.
        let target_oid = fs.resolve_path_to_oid(dev, path)?;

        // Read all current bytes.
        let mut contents = {
            let mut r = fs.open_file_reader(dev, path)?;
            let mut buf = Vec::new();
            std::io::Read::read_to_end(&mut r, &mut buf)
                .map_err(|e| crate::Error::Io(std::io::Error::other(e)))?;
            buf
        };

        if flags.truncate {
            contents.clear();
        }
        let pos = if flags.append {
            contents.len() as u64
        } else {
            0
        };

        Ok(Self {
            fs,
            dev,
            target_oid,
            contents,
            pos,
            // Truncate counts as a mutation that needs committing on
            // sync; create=false guarantees we didn't synthesize a new
            // inode here.
            dirty: flags.truncate,
        })
    }
}

impl<'a> Read for ApfsFileHandle<'a> {
    /// Copy bytes from the in-memory file image, starting at the
    /// cursor, into `buf` and advance the cursor by the amount copied.
    ///
    /// Returns `Ok(0)` when the cursor sits at or beyond the end of the
    /// contents. Never fails.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let total = self.contents.len();
        if self.pos >= total as u64 {
            return Ok(0);
        }
        // The cursor is strictly inside `contents`, so it fits a usize.
        let offset = self.pos as usize;
        let available = &self.contents[offset..];
        let count = available.len().min(buf.len());
        buf[..count].copy_from_slice(&available[..count]);
        self.pos += count as u64;
        Ok(count)
    }
}

impl<'a> Write for ApfsFileHandle<'a> {
    /// Copy `buf` into the in-memory file image at the cursor, growing
    /// the image (zero-filling any gap left by a seek past the end) as
    /// needed, then advance the cursor and mark the handle dirty.
    ///
    /// Always writes the whole of `buf` on success. An empty `buf` is a
    /// no-op that does not mark the handle dirty.
    ///
    /// # Errors
    ///
    /// * The cursor or the end offset does not fit in `usize`.
    /// * `ErrorKind::OutOfMemory` when the image cannot be grown to the
    ///   required size. In both cases the handle is left unchanged.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        if buf.is_empty() {
            return Ok(0);
        }
        // A plain `as usize` would silently wrap the cursor on targets
        // where usize is narrower than u64 and write at the wrong place.
        let start = usize::try_from(self.pos)
            .map_err(|_| std::io::Error::other("apfs: write offset overflow"))?;
        let end = start
            .checked_add(buf.len())
            .ok_or_else(|| std::io::Error::other("apfs: write offset overflow"))?;
        let current_len = self.contents.len();
        if end > current_len {
            // Reserve fallibly first: `resize` alone panics or aborts
            // when the requested size cannot be allocated (e.g. after a
            // seek to a huge offset).
            self.contents
                .try_reserve(end - current_len)
                .map_err(|e| std::io::Error::new(std::io::ErrorKind::OutOfMemory, e))?;
            self.contents.resize(end, 0);
        }
        self.contents[start..end].copy_from_slice(buf);
        self.pos = end as u64;
        self.dirty = true;
        Ok(buf.len())
    }

    /// No-op: bytes stay in `contents` until `sync()` commits them.
    fn flush(&mut self) -> std::io::Result<()> {
        // Per the FileHandle contract, `flush` is a hint; we leave
        // bytes in `contents` until `sync()` actually commits a
        // checkpoint. (Eagerly committing a checkpoint per Write::flush
        // would burn xp_desc slots fast.)
        Ok(())
    }
}

impl<'a> Seek for ApfsFileHandle<'a> {
    /// Move the cursor and return its new absolute position.
    ///
    /// `SeekFrom::End` is relative to the current length of the
    /// in-memory contents. Seeking past the end is allowed.
    ///
    /// # Errors
    ///
    /// `ErrorKind::InvalidInput` when the resulting position would be
    /// negative or would not fit in a `u64`; the cursor is left
    /// unchanged in both cases.
    fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
        // i128 holds every u64 + i64 combination without overflow.
        let target: i128 = match pos {
            SeekFrom::Start(n) => i128::from(n),
            SeekFrom::Current(d) => i128::from(self.pos) + i128::from(d),
            SeekFrom::End(d) => self.contents.len() as i128 + i128::from(d),
        };
        if target < 0 {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "apfs: seek to negative offset",
            ));
        }
        // A plain `as u64` would wrap a target above u64::MAX around to
        // a small offset and report success.
        let new_pos = u64::try_from(target).map_err(|_| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "apfs: seek offset overflows u64",
            )
        })?;
        self.pos = new_pos;
        Ok(new_pos)
    }
}

impl<'a> FileHandle for ApfsFileHandle<'a> {
    fn len(&self) -> u64 {
        self.contents.len() as u64
    }

    fn set_len(&mut self, new_len: u64) -> Result<()> {
        let new_len = new_len as usize;
        if new_len != self.contents.len() {
            self.contents.resize(new_len, 0);
            self.dirty = true;
            if self.pos > new_len as u64 {
                self.pos = new_len as u64;
            }
        }
        Ok(())
    }

    fn sync(&mut self) -> Result<()> {
        if !self.dirty {
            return Ok(());
        }
        commit_checkpoint(self.fs, self.dev, self.target_oid, &self.contents)?;
        self.dirty = false;
        Ok(())
    }
}

/// Commit a checkpoint that replaces the file at `target_oid` with the
/// given `new_bytes`. All other inodes / directories / xattrs are
/// preserved exactly.
///
/// The whole fs-tree is dumped into RAM, the target's extent records
/// are swapped for a single freshly allocated extent (none at all for
/// an empty file), the target's inode sizes are patched, and the full
/// record set is re-emitted through a checkpoint-mode
/// [`write::ApfsWriter`].
///
/// On return, `fs.state` is refreshed to a fresh write state that
/// reflects the newly-written checkpoint.
///
/// # Errors
///
/// * `Error::Unsupported` when `fs` is not in write state.
/// * `Error::InvalidArgument` when the new extent does not fit between
///   the bump high-water mark and the end of the container.
/// * Any error from reading the tree, writing blocks, the writer, or
///   reopening the container.
fn commit_checkpoint(
    fs: &mut Apfs,
    dev: &mut dyn BlockDevice,
    target_oid: u64,
    new_bytes: &[u8],
) -> Result<()> {
    /// Decode the `(oid, kind)` pair packed into the first 8 bytes of an
    /// fs-tree key; `None` when the key is shorter than that.
    fn key_oid_and_kind(key: &[u8]) -> Option<(u64, u8)> {
        let raw: [u8; 8] = key.get(0..8)?.try_into().ok()?;
        let hdr = u64::from_le_bytes(raw);
        Some((hdr & OBJ_ID_MASK, (hdr >> OBJ_TYPE_SHIFT) as u8))
    }

    // Snapshot the writer-facing checkpoint metadata.
    let (
        block_size,
        total_blocks,
        volume_name,
        container_uuid,
        volume_uuid,
        cur_xid,
        next_xp_desc_slot,
        bump_high_water,
        next_oid,
        num_files,
        num_directories,
        num_symlinks,
    ) = match &fs.state {
        ApfsState::Write(w) => (
            fs.block_size,
            w.total_blocks,
            w.volume_name.clone(),
            w.container_uuid,
            w.volume_uuid,
            w.cur_xid,
            w.next_xp_desc_slot,
            w.bump_high_water,
            w.next_oid,
            w.num_files,
            w.num_directories,
            w.num_symlinks,
        ),
        _ => {
            return Err(crate::Error::Unsupported(
                "apfs: sync called outside write state".into(),
            ));
        }
    };
    let bs_u64 = block_size as u64;
    let new_size = new_bytes.len() as u64;

    // ---- 1. Dump every existing fs-tree record into RAM ----
    let mut records = dump_all_records(fs, dev)?;

    // ---- 2. Mutate records: drop the target's existing FILE_EXTENT /
    //         DSTREAM_ID records (re-inserted in step 4) and patch the
    //         sizes stored in its inode record. Keys too short to carry
    //         a header are kept untouched.
    records.retain(|(k, _)| match key_oid_and_kind(k.as_slice()) {
        Some((oid, kind)) => {
            !(oid == target_oid
                && (kind == APFS_TYPE_FILE_EXTENT || kind == APFS_TYPE_DSTREAM_ID))
        }
        None => true,
    });
    for (k, v) in records.iter_mut() {
        if key_oid_and_kind(k.as_slice()) == Some((target_oid, APFS_TYPE_INODE)) {
            patch_inode_dstream_size(v, new_size, bs_u64);
        }
    }

    // ---- 3. Bump-allocate a fresh extent for the new file bytes ----
    let (extent_paddr, extent_blocks) = if new_size == 0 {
        (0u64, 0u64)
    } else {
        let blocks = new_size.div_ceil(bs_u64);
        let start = bump_high_water;
        let end = start.checked_add(blocks).ok_or_else(|| {
            crate::Error::InvalidArgument("apfs: extent allocation overflow".into())
        })?;
        if end > total_blocks {
            // saturating_sub: the high-water mark may already sit past
            // the end of the container, and a plain subtraction would
            // underflow while building the error message.
            return Err(crate::Error::InvalidArgument(format!(
                "apfs: not enough free blocks to write {new_size} bytes \
                 (need {blocks}, have {})",
                total_blocks.saturating_sub(bump_high_water)
            )));
        }
        (start, blocks)
    };

    // Write the bytes into the new extent, one block at a time. The
    // last block is zero-padded. An empty file yields no chunks, so
    // nothing is written.
    let mut blk = vec![0u8; block_size as usize];
    for (i, chunk) in new_bytes.chunks(block_size as usize).enumerate() {
        blk[..chunk.len()].copy_from_slice(chunk);
        blk[chunk.len()..].fill(0);
        dev.write_at((extent_paddr + i as u64) * bs_u64, &blk)?;
    }

    // ---- 4. Re-insert FILE_EXTENT + DSTREAM_ID records for the target.
    // NOTE(review): these are appended after `dump_all_records` sorted
    // the set — presumably the writer re-sorts what it is given; confirm.
    if new_size > 0 {
        // Key: header + logical_addr (0: the extent starts the file).
        let hdr = ((APFS_TYPE_FILE_EXTENT as u64) << OBJ_TYPE_SHIFT) | (target_oid & OBJ_ID_MASK);
        let mut key = Vec::with_capacity(16);
        key.extend_from_slice(&hdr.to_le_bytes());
        key.extend_from_slice(&0u64.to_le_bytes());
        // Value: allocated byte length, physical block address, and a
        // trailing 8 bytes left at zero.
        let alloc = extent_blocks * bs_u64;
        let mut val = vec![0u8; 24];
        val[0..8].copy_from_slice(&alloc.to_le_bytes());
        val[8..16].copy_from_slice(&extent_paddr.to_le_bytes());
        records.push((key, val));

        let dhdr = ((APFS_TYPE_DSTREAM_ID as u64) << OBJ_TYPE_SHIFT) | (target_oid & OBJ_ID_MASK);
        let dkey = dhdr.to_le_bytes().to_vec();
        // NOTE(review): the 4-byte value is all zeroes. If it is the
        // dstream reference count, 0 looks suspicious for a live file —
        // confirm against what the format-time writer emits.
        let dval = vec![0u8; 4];
        records.push((dkey, dval));
    }

    // ---- 5. Construct a checkpoint-mode writer and emit. ----
    let new_bump_start = bump_high_water + extent_blocks;
    let new_xid = cur_xid + 1;
    {
        let mut w = write::ApfsWriter::new_checkpoint(
            dev,
            total_blocks,
            block_size,
            &volume_name,
            container_uuid,
            volume_uuid,
            new_xid,
            next_xp_desc_slot,
            new_bump_start,
            next_oid,
            num_files,
            num_directories,
            num_symlinks,
        )?;
        for (k, v) in records {
            w.push_raw_record(k, v);
        }
        w.finish()?;
    }

    // ---- 6. Refresh `fs.state` by reopening the (now-newer)
    //         checkpoint as a fresh write state.
    let refreshed = Apfs::open_writable(dev)?;
    debug_assert!(matches!(refreshed.state, ApfsState::Write(_)));
    fs.state = refreshed.state;
    fs.volume_name = refreshed.volume_name;
    Ok(())
}

/// Walk every record in the fs-tree and return them as `(key, val)`
/// byte pairs. Sorted by `(oid, kind, tail)` per APFS canonical order
/// because `RangeScan` yields entries in that order anyway, but we
/// re-sort defensively.
fn dump_all_records(fs: &mut Apfs, dev: &mut dyn BlockDevice) -> Result<Vec<(Vec<u8>, Vec<u8>)>> {
    // Use the read state's caches.
    let rs = fs.read_state()?;
    let block_size = fs.block_size;
    let mut ctx = rs.fs_ctx.borrow_mut();
    let mut out = Vec::new();
    // We scan once per (oid, kind) pair we expect, but that's expensive.
    // The simpler tactic: walk every distinct oid we see by starting
    // each scan at (0, INODE) and following the natural tree order.
    // But RangeScan stops at the prefix boundary, so we'd need to
    // restart per (oid, kind).
    //
    // Instead, walk by repeated range-scans across each known record
    // kind. To know which oids exist, first walk every drec record
    // under root to enumerate dirs+files, then recurse into subdirs.
    // That's complex. The fast workaround: walk the tree via the leaf
    // iterator directly by re-using RangeScan with a sentinel start
    // (oid=0, kind=0) that matches the empty prefix.
    //
    // RangeScan's `stop_oid/stop_kind` are derived from the start
    // target, so scanning with oid=0,kind=0 stops immediately on the
    // first non-matching key. To dump the *whole* tree we instead walk
    // each leaf physically. The fs-tree has at most one internal level
    // — so the omap tells us where the leaves live.

    // Pull every fs-tree leaf paddr out of the omap.
    let leaf_paddrs = collect_fs_leaf_paddrs(&rs.fsroot_block, &mut ctx, &mut |paddr, buf| {
        read_at_paddr(dev, paddr, block_size, buf)
    })?;
    drop(ctx);

    if leaf_paddrs.is_empty() {
        // Single root-leaf tree: walk records directly out of the
        // cached fsroot_block.
        let node = super::btree::BTreeNode::decode(&rs.fsroot_block)?;
        if node.is_leaf() {
            for i in 0..node.nkeys {
                let (kb, vb) = node.entry_at(i, 0, 0)?;
                out.push((kb.to_vec(), vb.to_vec()));
            }
        }
    } else {
        for paddr in leaf_paddrs {
            let mut blk = vec![0u8; block_size as usize];
            read_at_paddr(dev, paddr, block_size, &mut blk)?;
            let node = super::btree::BTreeNode::decode(&blk)?;
            if !node.is_leaf() {
                continue;
            }
            for i in 0..node.nkeys {
                let (kb, vb) = node.entry_at(i, 0, 0)?;
                out.push((kb.to_vec(), vb.to_vec()));
            }
        }
    }

    // Defensive sort by (oid, kind, tail).
    out.sort_by(|a, b| {
        let ka = sort_key_for(&a.0);
        let kb = sort_key_for(&b.0);
        ka.cmp(&kb)
    });
    Ok(out)
}

/// Build the `(oid, kind, tail)` ordering key for a raw fs-tree key.
///
/// The first 8 bytes are a little-endian header holding the object id
/// (masked with `OBJ_ID_MASK`) and the record kind (the bits above
/// `OBJ_TYPE_SHIFT`); everything after them is the tail. Keys shorter
/// than 8 bytes sort as `(0, 0, <whole key>)`.
fn sort_key_for(key: &[u8]) -> (u64, u8, Vec<u8>) {
    let Some(head) = key.get(..8) else {
        return (0, 0, key.to_vec());
    };
    let mut raw = [0u8; 8];
    raw.copy_from_slice(head);
    let header = u64::from_le_bytes(raw);
    (
        header & OBJ_ID_MASK,
        (header >> OBJ_TYPE_SHIFT) as u8,
        key[8..].to_vec(),
    )
}

/// Return the physical block address of every fs-tree leaf that hangs
/// directly off the root stored in `fsroot_block`.
///
/// * If the root is itself a leaf, the result is empty: the root's own
///   physical address is not known here, so the caller is expected to
///   read the records straight out of `fsroot_block`.
/// * Otherwise each root entry's 8-byte value is taken as a child
///   virtual oid, resolved through `ctx`, and the child block is read
///   with `read_block` to check that it is a leaf. Entries whose value
///   is shorter than 8 bytes are skipped.
///
/// # Errors
///
/// Returns `Error::Unsupported` if any child of the root is not a leaf
/// (tree depth > 2), and forwards decode / resolve / read failures.
fn collect_fs_leaf_paddrs<F>(
    fsroot_block: &[u8],
    ctx: &mut FsTreeCtx,
    read_block: &mut F,
) -> Result<Vec<u64>>
where
    F: FnMut(u64, &mut [u8]) -> Result<()>,
{
    let root = super::btree::BTreeNode::decode(fsroot_block)?;
    if root.is_leaf() {
        return Ok(Vec::new());
    }

    let mut leaves = Vec::with_capacity(root.nkeys as usize);
    for idx in 0..root.nkeys {
        let (_, child_ref) = root.entry_at(idx, 0, 8)?;
        let Some(vid_bytes) = child_ref.get(0..8) else {
            continue;
        };
        let child_vid = u64::from_le_bytes(vid_bytes.try_into().unwrap());
        let child_paddr = ctx.resolve_vid(child_vid, read_block)?;

        // Read the child only to learn whether it is a leaf.
        let mut scratch = vec![0u8; ctx.block_size];
        read_block(child_paddr, &mut scratch)?;
        if !super::btree::BTreeNode::decode(&scratch)?.is_leaf() {
            // A second internal level is beyond what this path handles.
            return Err(crate::Error::Unsupported(
                "apfs: fs-tree depth > 2 isn't supported by the rw path".into(),
            ));
        }
        leaves.push(child_paddr);
    }
    Ok(leaves)
}

/// Patch an inode value so it advertises `new_size` bytes of data.
///
/// Two places are updated in `val`:
///
/// 1. The 8-byte size field at offset 84..92 of the fixed-size part
///    (called `total_size` here). NOTE(review): Apple's reference lists
///    this slot as `uncompressed_size` — confirm the intended meaning.
/// 2. The first DSTREAM extended field found in the trailing xfield
///    blob: its `size` becomes `new_size` and its `alloced_size`
///    becomes `new_size` rounded up to a multiple of `block_size`.
///
/// The xfield blob is laid out as a 4-byte header (the first two bytes
/// are the field count), one 4-byte descriptor per field (type, one
/// byte this function ignores, u16 size), then the field values in the
/// same order, each padded to 8 bytes.
///
/// A value shorter than the fixed inode size is left untouched. A
/// blob that is missing, truncated, or has no DSTREAM field of at least
/// 16 bytes gets only the fixed-part update; the function never panics
/// on malformed blobs.
fn patch_inode_dstream_size(val: &mut [u8], new_size: u64, block_size: u64) {
    if val.len() < J_INODE_VAL_FIXED_SIZE {
        return;
    }
    // The fixed inode value carries `total_size` at offset 84..92.
    val[84..92].copy_from_slice(&new_size.to_le_bytes());

    // Everything after the fixed part is the xfield blob.
    let xfields = &mut val[J_INODE_VAL_FIXED_SIZE..];
    if xfields.len() < 4 {
        return;
    }
    let num_exts = usize::from(u16::from_le_bytes([xfields[0], xfields[1]]));
    // The value area starts right after all the field descriptors.
    let headers_end = 4 + num_exts * 4;
    if xfields.len() < headers_end {
        return;
    }

    let mut value_cursor = headers_end;
    for i in 0..num_exts {
        let hdr_off = 4 + i * 4;
        let kind = xfields[hdr_off];
        let size = usize::from(u16::from_le_bytes([
            xfields[hdr_off + 2],
            xfields[hdr_off + 3],
        ]));
        if kind == INO_EXT_TYPE_DSTREAM && size >= 16 {
            // j_dstream_t: u64 size, u64 alloced_size, ...
            // The descriptor may promise more bytes than the blob
            // actually holds; bail out instead of slicing out of bounds.
            let Some(dstream) = xfields.get_mut(value_cursor..value_cursor + 16) else {
                return;
            };
            let alloc = new_size.div_ceil(block_size) * block_size;
            dstream[..8].copy_from_slice(&new_size.to_le_bytes());
            dstream[8..].copy_from_slice(&alloc.to_le_bytes());
            return;
        }
        // Skip this field's value, including its padding to 8 bytes.
        value_cursor += size.next_multiple_of(8);
        if value_cursor > xfields.len() {
            return;
        }
    }
}

// Tests for the rw module live alongside the rest of the apfs
// integration tests in src/fs/apfs/mod.rs (round-trip format → mutate →
// reopen → read).