Skip to main content

grit_lib/
gc.rs

1//! In-process maintenance primitives that embedders such as `jj` use in place
2//! of shelling out to `git gc` / `git remote show` / `gix::refs::transaction`.
3//!
4//! These are the remaining "replaced paths" from the jj spike (PR #9632) that
5//! are not transport-shaped:
6//!
7//! * [`prune_loose_unreachable`] — what `jj util gc` actually needs: delete the
8//!   loose objects that are not reachable from a set of roots (full repack /
9//!   `pack-refs` are explicitly out of scope).
10//! * [`remote_default_branch_local`] — the `git remote show` default-branch
11//!   lookup for a local / `file://` remote, via the remote `HEAD` symref.
12//! * [`update_refs`] — a thin, compare-and-swap, all-or-nothing batch ref
13//!   transaction over the [`crate::refs`] primitives (the in-process equivalent
14//!   of what jj built on `gix::refs::transaction`).
15//!
16//! Scope note: only the local / on-disk path is in scope here. `git://`,
17//! `http(s)`, and `ssh` remotes are out of scope and left as TODOs.
18
19use std::collections::{HashSet, VecDeque};
20use std::path::Path;
21use std::time::SystemTime;
22
23use crate::error::{Error, Result};
24use crate::objects::{parse_commit, parse_tag, parse_tree, ObjectId, ObjectKind};
25use crate::odb::Odb;
26
27/// Result of a loose-object prune.
28#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
29pub struct PruneStats {
30    /// Number of loose objects deleted.
31    pub pruned: usize,
32    /// Number of loose objects kept (reachable, or too recent to prune).
33    pub kept: usize,
34}
35
36/// Delete loose objects in `odb` that are not reachable from `reachable_roots`.
37///
38/// This is the in-process core of `jj util gc`: it walks the full reachability
39/// closure from `reachable_roots` (commits → parents and trees, trees → entries,
40/// annotated tags → their target) and then removes every **loose** object whose
41/// id is not in that closure. Packed objects are never touched — only loose
42/// object files under `objects/??/` are candidates for deletion.
43///
44/// When `keep_newer_than` is `Some(t)`, a loose object is only deleted if its
45/// file modification time is strictly older than `t`. This mirrors Git's
46/// `gc.pruneExpire` grace window: recently written objects (which may be the
47/// in-progress target of a concurrent operation) are kept even when currently
48/// unreachable. A `None` grace window prunes every unreachable loose object
49/// regardless of age.
50///
51/// Submodule (gitlink) tree entries are skipped during the walk: the commit they
52/// name lives in another object store.
53///
54/// The repository hash width is threaded through [`Odb::hash_algo`] (via the
55/// 2-char fan-out directory + suffix length), so SHA-256 repositories work.
56///
57/// # Errors
58///
59/// Returns an error if a root or a reachable object cannot be read or parsed, or
60/// on I/O failure while enumerating or deleting loose object files.
61pub fn prune_loose_unreachable(
62    odb: &Odb,
63    reachable_roots: &[ObjectId],
64    keep_newer_than: Option<SystemTime>,
65) -> Result<PruneStats> {
66    // 1. Full reachability closure from the roots.
67    let reachable = reachable_closure(odb, reachable_roots)?;
68
69    // 2. Enumerate loose objects and delete the unreachable, sufficiently-old ones.
70    let mut stats = PruneStats::default();
71    for (oid, path) in enumerate_loose_objects(odb)? {
72        if reachable.contains(&oid) {
73            stats.kept += 1;
74            continue;
75        }
76
77        // Respect the grace window: keep objects whose mtime is not strictly
78        // older than the cutoff.
79        if let Some(cutoff) = keep_newer_than {
80            let too_new = std::fs::metadata(&path)
81                .and_then(|m| m.modified())
82                .map(|mtime| mtime >= cutoff)
83                .unwrap_or(false);
84            if too_new {
85                stats.kept += 1;
86                continue;
87            }
88        }
89
90        match std::fs::remove_file(&path) {
91            Ok(()) => stats.pruned += 1,
92            // A concurrent prune may have already removed it; treat as pruned.
93            Err(e) if e.kind() == std::io::ErrorKind::NotFound => stats.pruned += 1,
94            Err(e) => return Err(Error::Io(e)),
95        }
96    }
97
98    Ok(stats)
99}
100
101/// Compute the full object closure reachable from `roots` (commits → parents and
102/// tree, trees → entries, tags → target). Mirrors the reachability walk used by
103/// the transfer pack builder, but specialized for prune (no exclusion set).
104fn reachable_closure(odb: &Odb, roots: &[ObjectId]) -> Result<HashSet<ObjectId>> {
105    let mut seen: HashSet<ObjectId> = HashSet::new();
106    let mut queue: VecDeque<ObjectId> = VecDeque::new();
107
108    for &root in roots {
109        if seen.insert(root) {
110            queue.push_back(root);
111        }
112    }
113
114    while let Some(oid) = queue.pop_front() {
115        let obj = odb.read(&oid)?;
116        match obj.kind {
117            ObjectKind::Commit => {
118                let commit = parse_commit(&obj.data)?;
119                for parent in commit.parents {
120                    if seen.insert(parent) {
121                        queue.push_back(parent);
122                    }
123                }
124                if seen.insert(commit.tree) {
125                    queue.push_back(commit.tree);
126                }
127            }
128            ObjectKind::Tree => {
129                for entry in parse_tree(&obj.data)? {
130                    // Skip submodule (gitlink) entries.
131                    if entry.mode == 0o160000 {
132                        continue;
133                    }
134                    if seen.insert(entry.oid) {
135                        queue.push_back(entry.oid);
136                    }
137                }
138            }
139            ObjectKind::Tag => {
140                let tag = parse_tag(&obj.data)?;
141                if seen.insert(tag.object) {
142                    queue.push_back(tag.object);
143                }
144            }
145            ObjectKind::Blob => {}
146        }
147    }
148
149    Ok(seen)
150}
151
152/// Enumerate the loose objects physically present in `odb`'s objects directory,
153/// returning each `(oid, path)`. Only the `??/<rest>` fan-out directories are
154/// scanned; pack files, `info/`, and any non-object entries are ignored.
155///
156/// Entries whose names do not form a valid full-length hex OID for the
157/// repository's hash algorithm are skipped (e.g. tmp files, the wrong hash
158/// width), matching Git's loose-object scan.
159fn enumerate_loose_objects(odb: &Odb) -> Result<Vec<(ObjectId, std::path::PathBuf)>> {
160    let objects_dir = odb.objects_dir();
161    let mut out = Vec::new();
162
163    let top = match std::fs::read_dir(objects_dir) {
164        Ok(rd) => rd,
165        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(out),
166        Err(e) => return Err(Error::Io(e)),
167    };
168
169    for top_entry in top {
170        let top_entry = top_entry.map_err(Error::Io)?;
171        let name = top_entry.file_name();
172        let Some(prefix) = name.to_str() else {
173            continue;
174        };
175        // Fan-out directories are exactly two lowercase hex chars.
176        if prefix.len() != 2 || !prefix.bytes().all(|b| b.is_ascii_hexdigit()) {
177            continue;
178        }
179        if !top_entry.file_type().map_err(Error::Io)?.is_dir() {
180            continue;
181        }
182
183        let sub = match std::fs::read_dir(top_entry.path()) {
184            Ok(rd) => rd,
185            Err(e) if e.kind() == std::io::ErrorKind::NotFound => continue,
186            Err(e) => return Err(Error::Io(e)),
187        };
188        for sub_entry in sub {
189            let sub_entry = sub_entry.map_err(Error::Io)?;
190            let suffix_name = sub_entry.file_name();
191            let Some(suffix) = suffix_name.to_str() else {
192                continue;
193            };
194            if !ObjectId::is_loose_suffix_len(suffix.len())
195                || !suffix.bytes().all(|b| b.is_ascii_hexdigit())
196            {
197                continue;
198            }
199            let hex = format!("{prefix}{suffix}");
200            let Ok(oid) = ObjectId::from_hex(&hex) else {
201                continue;
202            };
203            out.push((oid, sub_entry.path()));
204        }
205    }
206
207    Ok(out)
208}
209
210/// Return the short name of a local remote's default branch (its `HEAD` symref
211/// target), e.g. `main` for a `HEAD` pointing at `refs/heads/main`.
212///
213/// This is the `git remote show <remote>` default-branch lookup for a local /
214/// `file://` remote. It reuses [`crate::ls_remote::ls_remote`]'s symref handling
215/// to read the remote `HEAD` and strips the `refs/heads/` prefix from its
216/// target. Returns `None` when the remote has no symbolic `HEAD` (e.g. a
217/// detached or absent `HEAD`).
218///
219/// # Errors
220///
221/// Returns an error if the remote git directory cannot be read.
222//
223// TODO(phase: remote transports): the `git://`, `http(s)`, and `ssh`
224// default-branch lookup (handshake `symref=HEAD:`) is out of scope here.
225pub fn remote_default_branch_local(remote_git_dir: &Path) -> Result<Option<String>> {
226    let remote_odb =
227        Odb::new(&remote_git_dir.join("objects")).with_config_git_dir(remote_git_dir.to_path_buf());
228
229    let entries = crate::ls_remote::ls_remote(
230        remote_git_dir,
231        &remote_odb,
232        &crate::ls_remote::Options {
233            symref: true,
234            ..Default::default()
235        },
236    )?;
237
238    for entry in &entries {
239        if entry.name == "HEAD" {
240            return Ok(entry
241                .symref_target
242                .as_ref()
243                .map(|t| t.strip_prefix("refs/heads/").unwrap_or(t).to_owned()));
244        }
245    }
246
247    Ok(None)
248}
249
250/// A single ref change in an [`update_refs`] batch transaction.
251#[derive(Clone, Debug)]
252pub struct RefTransactionItem {
253    /// Full ref name (e.g. `refs/heads/main`).
254    pub name: String,
255    /// New value to write, or `None` to delete the ref.
256    pub new_oid: Option<ObjectId>,
257    /// Compare-and-swap expectation. When `Some`, the ref's current value must
258    /// equal this for the item to apply (an `expected_old` of an oid that is not
259    /// the current value — including when the ref is absent — fails the batch).
260    /// When `None`, the current value is not checked.
261    ///
262    /// Note: this CAS form expects the ref to currently hold `expected_old`. To
263    /// require that a ref be *created* (must not already exist), leave this
264    /// `None` — callers that need create-only semantics check existence
265    /// themselves; matching jj's transaction model where `None` means "any".
266    pub expected_old: Option<ObjectId>,
267}
268
269/// Apply a batch of ref create/update/delete operations transactionally with
270/// compare-and-swap semantics.
271///
272/// Every item whose `expected_old` is `Some` is checked against the ref's
273/// current value first; if **any** CAS check fails, the entire batch is rejected
274/// and **nothing** is written (all-or-nothing). Only once all CAS checks pass
275/// are the changes applied:
276///
277/// * `new_oid = Some(oid)` writes (creates or updates) the ref to `oid` via
278///   [`crate::refs::write_ref`].
279/// * `new_oid = None` deletes the ref via [`crate::refs::delete_ref`].
280///
281/// This is a thin transactional wrapper over the [`crate::refs`] primitives,
282/// matching the in-process ref-update path jj built on `gix::refs::transaction`.
283///
284/// # Errors
285///
286/// Returns [`Error::Message`] describing the first failing CAS check (before any
287/// mutation), or an I/O / ref error if applying a change fails after the checks
288/// passed. The CAS pre-check makes a clean rejection the common failure mode;
289/// an apply-time error can still leave a partially-applied batch (the
290/// pre-checked, conflict-free case), which is the same guarantee Git's files
291/// backend gives outside of `core.refTransaction` hooks.
292pub fn update_refs(git_dir: &Path, updates: &[RefTransactionItem]) -> Result<()> {
293    // Phase 1: verify every CAS expectation against current state. Apply nothing
294    // if any check fails.
295    for item in updates {
296        if let Some(expected) = item.expected_old {
297            let current = crate::refs::resolve_ref(git_dir, &item.name).ok();
298            if current != Some(expected) {
299                return Err(Error::Message(format!(
300                    "ref transaction rejected: '{}' expected {} but found {}",
301                    item.name,
302                    expected,
303                    current
304                        .map(|o| o.to_hex())
305                        .unwrap_or_else(|| "<absent>".to_owned()),
306                )));
307            }
308        }
309    }
310
311    // Phase 2: apply. All CAS checks passed, so conflicts are not expected.
312    for item in updates {
313        match &item.new_oid {
314            Some(oid) => crate::refs::write_ref(git_dir, &item.name, oid)?,
315            None => crate::refs::delete_ref(git_dir, &item.name)?,
316        }
317    }
318
319    Ok(())
320}