Skip to main content

btrfs_cli/rescue/
clear_space_cache.rs

1use crate::{RunContext, Runnable, util::is_mounted};
2use anyhow::{Context, Result, bail};
3use btrfs_disk::{
4    items::{FileExtentBody, FileExtentItem},
5    raw,
6    tree::{DiskKey, KeyType},
7};
8use btrfs_transaction::{
9    allocation,
10    filesystem::Filesystem,
11    items,
12    path::BtrfsPath,
13    search::{self, SearchIntent},
14    transaction::Transaction,
15};
16use clap::Parser;
17use std::{
18    fs::OpenOptions,
19    io::{Read, Seek, Write},
20    path::PathBuf,
21};
22
23/// Free space tree object ID (tree 10).
24const FREE_SPACE_TREE_OBJECTID: u64 =
25    raw::BTRFS_FREE_SPACE_TREE_OBJECTID as u64;
26
27/// Root tree ID.
28const ROOT_TREE_OBJECTID: u64 = 1;
29
30/// Special objectid that holds v1 free space cache headers
31/// (`BTRFS_FREE_SPACE_OBJECTID` == -11 sign-extended).
32const FREE_SPACE_OBJECTID: u64 =
33    (raw::BTRFS_FREE_SPACE_OBJECTID as i64).cast_unsigned();
34
35/// The v1 free space header item is stored under key type 0 (no
36/// dedicated `KeyType` variant; this matches the kernel and
37/// btrfs-progs).
38const FREE_SPACE_HEADER_KEY_TYPE: u8 = 0;
39
40/// Free space cache version to clear.
41#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
42pub enum SpaceCacheVersion {
43    V1,
44    V2,
45}
46
47/// Completely remove the v1 or v2 free space cache
48///
49/// For v2, drops the FREE_SPACE_TREE root and clears the
50/// FREE_SPACE_TREE and FREE_SPACE_TREE_VALID compat_ro flags so that
51/// the kernel rebuilds the tree on the next mount with `space_cache=v2`.
52///
53/// For v1, see Stage G in transaction/PLAN.md: clearing v1 requires
54/// freeing data extents owned by the hidden free-space-cache inodes,
55/// which the transaction crate does not yet support.
56///
57/// The device must not be mounted.
58#[derive(Parser, Debug)]
59pub struct RescueClearSpaceCacheCommand {
60    /// Free space cache version to remove
61    version: SpaceCacheVersion,
62
63    /// Path to the btrfs device
64    device: PathBuf,
65}
66
67/// Recursively walk a tree starting at `bytenr`, collecting every block
68/// address (root, internal nodes, leaves).
69fn collect_tree_blocks<R: Read + Write + Seek>(
70    fs: &mut Filesystem<R>,
71    bytenr: u64,
72    out: &mut Vec<(u64, u8)>,
73) -> Result<()> {
74    let eb = fs
75        .read_block(bytenr)
76        .with_context(|| format!("failed to read tree block at {bytenr}"))?;
77    let level = eb.level();
78    out.push((bytenr, level));
79
80    if eb.is_node() {
81        let nritems = eb.nritems() as usize;
82        for slot in 0..nritems {
83            let child = eb.key_ptr_blockptr(slot);
84            collect_tree_blocks(fs, child, out)?;
85        }
86    }
87    Ok(())
88}
89
90/// Look up the `FREE_SPACE_HEADER` for one block group and, if
91/// present, walk the cache inode's `EXTENT_DATA` items to collect a
92/// [`V1CacheEntry`].
93fn read_v1_cache_entry<R: Read + Write + Seek>(
94    fs: &mut Filesystem<R>,
95    bg_start: u64,
96) -> Result<Option<V1CacheEntry>> {
97    // Step 1: find the FREE_SPACE_HEADER and parse the embedded
98    // location disk_key to get the inode number.
99    let header_key = DiskKey {
100        objectid: FREE_SPACE_OBJECTID,
101        key_type: KeyType::from_raw(FREE_SPACE_HEADER_KEY_TYPE),
102        offset: bg_start,
103    };
104    let mut path = BtrfsPath::new();
105    let found = search::search_slot(
106        None,
107        fs,
108        ROOT_TREE_OBJECTID,
109        &header_key,
110        &mut path,
111        SearchIntent::ReadOnly,
112        false,
113    )
114    .context("failed to search root tree for v1 cache header")?;
115    if !found {
116        path.release();
117        return Ok(None);
118    }
119    let leaf = path.nodes[0]
120        .as_ref()
121        .ok_or_else(|| anyhow::anyhow!("no leaf in v1 header path"))?;
122    let payload = leaf.item_data(path.slots[0]);
123    if payload.len() < 17 {
124        path.release();
125        bail!("FREE_SPACE_HEADER for bg {bg_start} truncated");
126    }
127    // The header begins with a btrfs_disk_key (17 bytes):
128    //   u64 objectid | u8 type | u64 offset
129    let ino = u64::from_le_bytes(payload[0..8].try_into().unwrap());
130    path.release();
131
132    // Step 2: walk the cache inode's EXTENT_DATA items in the root
133    // tree, recording (file_offset, disk_bytenr, disk_num_bytes) for
134    // each non-inline regular extent.
135    let mut extents: Vec<V1Extent> = Vec::new();
136
137    let start = DiskKey {
138        objectid: ino,
139        key_type: KeyType::ExtentData,
140        offset: 0,
141    };
142    let mut path = BtrfsPath::new();
143    search::search_slot(
144        None,
145        fs,
146        ROOT_TREE_OBJECTID,
147        &start,
148        &mut path,
149        SearchIntent::ReadOnly,
150        false,
151    )
152    .context("failed to search root tree for v1 cache extents")?;
153
154    'walk: loop {
155        let Some(leaf) = path.nodes[0].as_ref() else {
156            break;
157        };
158        let nritems = leaf.nritems() as usize;
159        if path.slots[0] >= nritems {
160            if !search::next_leaf(fs, &mut path).context("next_leaf failed")? {
161                break;
162            }
163            continue;
164        }
165        let key = leaf.item_key(path.slots[0]);
166        if key.objectid != ino {
167            break 'walk;
168        }
169        if key.key_type == KeyType::ExtentData {
170            let data = leaf.item_data(path.slots[0]);
171            let fei = FileExtentItem::parse(data).ok_or_else(|| {
172                anyhow::anyhow!(
173                    "malformed FILE_EXTENT for v1 cache inode {ino} offset {}",
174                    key.offset
175                )
176            })?;
177            match fei.body {
178                FileExtentBody::Regular {
179                    disk_bytenr,
180                    disk_num_bytes,
181                    ..
182                } => {
183                    extents.push(V1Extent {
184                        file_offset: key.offset,
185                        disk_bytenr,
186                        disk_num_bytes,
187                    });
188                }
189                FileExtentBody::Inline { .. } => {
190                    // Inline extents have no separate data extent;
191                    // record with disk_bytenr=0 so the apply pass
192                    // still deletes the EXTENT_DATA item but skips
193                    // the data ref drop.
194                    extents.push(V1Extent {
195                        file_offset: key.offset,
196                        disk_bytenr: 0,
197                        disk_num_bytes: 0,
198                    });
199                }
200            }
201        }
202        path.slots[0] += 1;
203    }
204    path.release();
205
206    Ok(Some(V1CacheEntry { ino, extents }))
207}
208
209/// Delete a single item identified by an exact key. Returns `false`
210/// (without erroring) if the item is missing, matching the C
211/// reference's tolerant behaviour for the cache inode item.
212fn delete_one_item<R: Read + Write + Seek>(
213    trans: &mut Transaction<R>,
214    fs: &mut Filesystem<R>,
215    tree_id: u64,
216    key: &DiskKey,
217) -> Result<bool> {
218    let mut path = BtrfsPath::new();
219    let found = search::search_slot(
220        Some(trans),
221        fs,
222        tree_id,
223        key,
224        &mut path,
225        SearchIntent::Delete,
226        true,
227    )
228    .with_context(|| {
229        format!("failed to search for {key:?} in tree {tree_id}")
230    })?;
231    if !found {
232        path.release();
233        return Ok(false);
234    }
235    let leaf = path.nodes[0]
236        .as_mut()
237        .ok_or_else(|| anyhow::anyhow!("delete_one_item: no leaf in path"))?;
238    items::del_items(leaf, path.slots[0], 1);
239    fs.mark_dirty(leaf);
240    path.release();
241    Ok(true)
242}
243
244impl Runnable for RescueClearSpaceCacheCommand {
245    fn run(&self, _ctx: &RunContext) -> Result<()> {
246        if is_mounted(&self.device) {
247            bail!("{} is currently mounted", self.device.display());
248        }
249
250        match self.version {
251            SpaceCacheVersion::V1 => self.clear_v1(),
252            SpaceCacheVersion::V2 => self.clear_v2(),
253        }
254    }
255}
256
257/// One free space cache file referenced from a block group's
258/// `FREE_SPACE_HEADER`. Collected during the read pass and consumed
259/// (via fresh COW searches) during the apply pass.
260struct V1CacheEntry {
261    /// Inode number that holds the cache file in the root tree.
262    ino: u64,
263    /// File-extent records found under that inode.
264    extents: Vec<V1Extent>,
265}
266
267struct V1Extent {
268    /// File offset key for this `EXTENT_DATA` item.
269    file_offset: u64,
270    /// Disk bytenr of the referenced extent (0 = hole/inline, skip).
271    disk_bytenr: u64,
272    /// On-disk byte length of the referenced extent.
273    disk_num_bytes: u64,
274}
275
276impl RescueClearSpaceCacheCommand {
277    /// Clear the v1 free space cache: for every block group, find its
278    /// `FREE_SPACE_HEADER` in the root tree, free the data extents
279    /// owned by the cache inode, and delete the cache items
280    /// (`FREE_SPACE_HEADER`, `EXTENT_DATA`s, `INODE_ITEM`).
281    ///
282    /// Mirrors the algorithm in `btrfs_clear_free_space_cache` from
283    /// btrfs-progs `kernel-shared/free-space-cache.c`, except that
284    /// the entire run happens in a single transaction (the C code
285    /// commits in clusters of 16 block groups for very large
286    /// filesystems).
287    fn clear_v1(&self) -> Result<()> {
288        let file = OpenOptions::new()
289            .read(true)
290            .write(true)
291            .open(&self.device)
292            .with_context(|| {
293                format!("failed to open '{}'", self.device.display())
294            })?;
295
296        let mut fs = Filesystem::open(file).with_context(|| {
297            format!("failed to open filesystem on '{}'", self.device.display())
298        })?;
299
300        // Pass 1: scan every block group for a v1 cache header and
301        // collect the work list before mutating anything.
302        let block_groups = allocation::load_block_groups(&mut fs)
303            .context("failed to load block groups")?;
304
305        let mut entries: Vec<(u64, V1CacheEntry)> = Vec::new();
306        for bg in &block_groups {
307            if let Some(entry) = read_v1_cache_entry(&mut fs, bg.start)? {
308                entries.push((bg.start, entry));
309            }
310        }
311
312        if entries.is_empty() {
313            // Still bump cache_generation so the kernel knows the v1
314            // cache is invalidated even on filesystems where it was
315            // never written.
316            if fs.superblock.cache_generation != u64::MAX {
317                let trans = Transaction::start(&mut fs)
318                    .context("failed to start transaction")?;
319                fs.superblock.cache_generation = u64::MAX;
320                trans
321                    .commit(&mut fs)
322                    .context("failed to commit transaction")?;
323                fs.sync().context("failed to sync to disk")?;
324            }
325            println!(
326                "no v1 free space cache found on {}",
327                self.device.display()
328            );
329            return Ok(());
330        }
331
332        // Pass 2: apply.
333        let mut trans = Transaction::start(&mut fs)
334            .context("failed to start transaction")?;
335
336        let mut total_extents_freed: usize = 0;
337        for (bg_start, entry) in &entries {
338            // Drop data refs first so the EXTENT_ITEMs are reclaimed
339            // when the delayed refs flush.
340            for ext in &entry.extents {
341                if ext.disk_bytenr == 0 {
342                    continue; // hole or inline; nothing to free
343                }
344                trans.delayed_refs.drop_data_ref(
345                    ext.disk_bytenr,
346                    ext.disk_num_bytes,
347                    ROOT_TREE_OBJECTID,
348                    entry.ino,
349                    ext.file_offset,
350                    1,
351                );
352                total_extents_freed += 1;
353            }
354
355            // Delete the FREE_SPACE_HEADER for this block group.
356            delete_one_item(
357                &mut trans,
358                &mut fs,
359                ROOT_TREE_OBJECTID,
360                &DiskKey {
361                    objectid: FREE_SPACE_OBJECTID,
362                    key_type: KeyType::from_raw(FREE_SPACE_HEADER_KEY_TYPE),
363                    offset: *bg_start,
364                },
365            )?;
366
367            // Delete every EXTENT_DATA item for the cache inode.
368            for ext in &entry.extents {
369                delete_one_item(
370                    &mut trans,
371                    &mut fs,
372                    ROOT_TREE_OBJECTID,
373                    &DiskKey {
374                        objectid: entry.ino,
375                        key_type: KeyType::ExtentData,
376                        offset: ext.file_offset,
377                    },
378                )?;
379            }
380
381            // Delete the INODE_ITEM (matches btrfs-progs which warns
382            // and continues if the item is missing).
383            let _ = delete_one_item(
384                &mut trans,
385                &mut fs,
386                ROOT_TREE_OBJECTID,
387                &DiskKey {
388                    objectid: entry.ino,
389                    key_type: KeyType::InodeItem,
390                    offset: 0,
391                },
392            );
393        }
394
395        // Mark the v1 cache as fully invalidated so the kernel won't
396        // try to load any leftover bits.
397        fs.superblock.cache_generation = u64::MAX;
398
399        trans
400            .commit(&mut fs)
401            .context("failed to commit transaction")?;
402        fs.sync().context("failed to sync to disk")?;
403
404        println!(
405            "cleared v1 free space cache on {} ({} block group(s), {} data extent(s) freed)",
406            self.device.display(),
407            entries.len(),
408            total_extents_freed,
409        );
410        Ok(())
411    }
412
413    fn clear_v2(&self) -> Result<()> {
414        let file = OpenOptions::new()
415            .read(true)
416            .write(true)
417            .open(&self.device)
418            .with_context(|| {
419                format!("failed to open '{}'", self.device.display())
420            })?;
421
422        let mut fs = Filesystem::open(file).with_context(|| {
423            format!("failed to open filesystem on '{}'", self.device.display())
424        })?;
425
426        let fst_flag = u64::from(raw::BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
427        let fst_valid_flag =
428            u64::from(raw::BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID);
429        let bgt_flag = u64::from(raw::BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE);
430
431        if fs.superblock.compat_ro_flags & bgt_flag != 0 {
432            bail!(
433                "cannot clear free space tree: filesystem has block-group-tree \
434                 enabled, which requires free-space-tree to mount"
435            );
436        }
437
438        if fs.superblock.compat_ro_flags & fst_flag == 0 {
439            println!("no free space tree to clear");
440            return Ok(());
441        }
442
443        let Some(fst_bytenr) = fs.root_bytenr(FREE_SPACE_TREE_OBJECTID) else {
444            // The compat_ro bit was set but no root pointer exists.
445            // Just clear the flags and write the superblock.
446            fs.superblock.compat_ro_flags &= !(fst_flag | fst_valid_flag);
447            let trans = Transaction::start(&mut fs)
448                .context("failed to start transaction")?;
449            trans
450                .commit(&mut fs)
451                .context("failed to commit transaction")?;
452            fs.sync().context("failed to sync to disk")?;
453            println!("cleared free space tree compat_ro flags");
454            return Ok(());
455        };
456
457        let mut tree_blocks = Vec::new();
458        collect_tree_blocks(&mut fs, fst_bytenr, &mut tree_blocks)
459            .context("failed to walk free space tree")?;
460
461        // Clear the compat_ro bits BEFORE the commit so the new
462        // superblock written at the end of commit no longer advertises
463        // an FST.
464        fs.superblock.compat_ro_flags &= !(fst_flag | fst_valid_flag);
465
466        let mut trans = Transaction::start(&mut fs)
467            .context("failed to start transaction")?;
468
469        for &(bytenr, level) in &tree_blocks {
470            trans.delayed_refs.drop_ref(
471                bytenr,
472                true,
473                FREE_SPACE_TREE_OBJECTID,
474                level,
475            );
476            trans.pin_block(bytenr);
477            fs.evict_block(bytenr);
478        }
479
480        // Delete the ROOT_ITEM for the FST from the root tree.
481        let root_key = DiskKey {
482            objectid: FREE_SPACE_TREE_OBJECTID,
483            key_type: KeyType::RootItem,
484            offset: 0,
485        };
486        let mut path = BtrfsPath::new();
487        let found = search::search_slot(
488            Some(&mut trans),
489            &mut fs,
490            ROOT_TREE_OBJECTID,
491            &root_key,
492            &mut path,
493            SearchIntent::Delete,
494            true,
495        )
496        .context("failed to search root tree for free space tree entry")?;
497
498        if found {
499            let leaf = path.nodes[0].as_mut().ok_or_else(|| {
500                anyhow::anyhow!("no leaf in path for root tree deletion")
501            })?;
502            items::del_items(leaf, path.slots[0], 1);
503            fs.mark_dirty(leaf);
504        }
505        path.release();
506
507        // Drop the FST from the in-memory roots map so the commit's
508        // update_free_space_tree pass early-returns (no FST root) and
509        // the commit doesn't try to write a ROOT_ITEM we just deleted.
510        fs.remove_root(FREE_SPACE_TREE_OBJECTID);
511
512        trans
513            .commit(&mut fs)
514            .context("failed to commit transaction")?;
515        fs.sync().context("failed to sync to disk")?;
516
517        println!(
518            "cleared free space tree on {} ({} blocks freed), kernel will rebuild it on next mount",
519            self.device.display(),
520            tree_blocks.len()
521        );
522        Ok(())
523    }
524}