Skip to main content

git_stats/
repo.rs

1//! Infrastructure: the only module that talks to gix and the filesystem.
2//!
3//! [`Repo`] has two constructors. [`Repo::open`] reads a real repository;
4//! [`Repo::create_null`] returns an in-memory stand-in driven by canned commits,
5//! so application and logic code can be tested without touching git.
6
7use rayon::prelude::*;
8
9use crate::error::{Error, Result};
10use crate::model::{Author, CommitMeta, DiffStat, Trailer};
11
12/// A commit yielded by [`Repo::walk`]: its logic-facing metadata plus an opaque
13/// handle the wrapper uses to compute the numstat later.
14pub struct WalkedCommit {
15    pub meta: CommitMeta,
16    is_merge: bool,
17    handle: Handle,
18}
19
20/// How to obtain a commit's numstat: from gix (real) or from a canned value (null).
21enum Handle {
22    Real(gix::ObjectId),
23    Null(DiffStat),
24}
25
/// A canned commit used to build a nulled [`Repo`] in tests.
///
/// [`Repo::walk`] on a nulled repository yields one [`WalkedCommit`] per
/// `NulledCommit`, in the order they were supplied.
#[derive(Debug, Clone)]
pub struct NulledCommit {
    /// Metadata that [`Repo::walk`] hands back as a clone of this value.
    pub meta: CommitMeta,
    /// Numstat that [`Repo::numstats`] reports for this commit, unless it is
    /// flagged as a merge.
    pub diff: DiffStat,
    /// When set, [`Repo::numstats`] reports an empty [`DiffStat`] instead of
    /// `diff`.
    pub is_merge: bool,
}
33
/// Where a [`Repo`] gets its commits from: a real repository opened through
/// gix, or a fixed list of canned commits.
enum Backend {
    // Boxed because a `ThreadSafeRepository` is far larger than the null variant.
    // Each operation derives its own thread-local repository from this handle.
    Real(Box<gix::ThreadSafeRepository>),
    // The canned commits returned by `walk`, in the order they were supplied.
    Null(Vec<NulledCommit>),
}
39
/// A git repository, or a nulled stand-in for it.
///
/// Construct one with [`Repo::open`] for a repository on disk, or with
/// [`Repo::create_null`] for an in-memory stand-in backed by canned commits.
pub struct Repo {
    // Selects between the gix-backed and the canned implementation of every
    // method on `Repo`.
    backend: Backend,
}
44
45impl Repo {
46    /// Open the repository containing `path`. Like git, this honors `GIT_DIR`
47    /// and related environment variables first (git sets them when running
48    /// hooks and for `--git-dir` invocations), then searches upward from
49    /// `path`.
50    ///
51    /// # Errors
52    ///
53    /// Returns an error if no git repository can be discovered from the
54    /// environment or `path`.
55    pub fn open(path: impl AsRef<std::path::Path>) -> Result<Self> {
56        let repo = gix::ThreadSafeRepository::discover_with_environment_overrides(path)
57            .map_err(|e| Error::OpenRepository(Box::new(e)))?;
58        Ok(Self {
59            backend: Backend::Real(Box::new(repo)),
60        })
61    }
62
63    /// Build a nulled repository whose [`walk`](Self::walk) returns the given
64    /// commits verbatim. The revision range is ignored, since range resolution
65    /// is real-git behavior exercised separately by integration tests.
66    #[must_use]
67    pub fn create_null(commits: Vec<NulledCommit>) -> Self {
68        Self {
69            backend: Backend::Null(commits),
70        }
71    }
72
73    /// Whether the repository is a shallow clone. Its truncated history makes
74    /// every count differ from the full clone's, so callers may want to warn.
75    /// Nulled repositories never are.
76    #[must_use]
77    pub fn is_shallow(&self) -> bool {
78        match &self.backend {
79            Backend::Real(tsr) => tsr.to_thread_local().is_shallow(),
80            Backend::Null(_) => false,
81        }
82    }
83
84    /// Read commit metadata for every commit in `range`. Trailers are parsed
85    /// only when `need_trailers` is set, since they feed only the reviews table
86    /// and decoding every commit message is costly on large histories.
87    ///
88    /// # Errors
89    ///
90    /// Returns an error if the range cannot be resolved or a commit object
91    /// cannot be read.
92    pub fn walk(&self, range: &str, need_trailers: bool) -> Result<Vec<WalkedCommit>> {
93        match &self.backend {
94            Backend::Real(tsr) => {
95                let mut repo = tsr.to_thread_local();
96                // Match `numstats`: an object cache lets the metadata read reuse
97                // commits the traversal already decoded instead of re-reading packs.
98                repo.object_cache_size_if_unset(8 * 1024 * 1024);
99                walk_real(&repo, range, need_trailers)
100            }
101            Backend::Null(commits) => Ok(commits
102                .iter()
103                .map(|c| WalkedCommit {
104                    meta: c.meta.clone(),
105                    is_merge: c.is_merge,
106                    handle: Handle::Null(c.diff),
107                })
108                .collect()),
109        }
110    }
111
112    /// Compute the numstat for each given commit. Merge commits contribute
113    /// nothing, matching `git log --numstat`'s default.
114    ///
115    /// # Errors
116    ///
117    /// Returns an error if a commit's tree or its parent's tree cannot be
118    /// diffed.
119    pub fn numstats(&self, commits: &[&WalkedCommit]) -> Result<Vec<DiffStat>> {
120        match &self.backend {
121            Backend::Real(tsr) => commits
122                .par_iter()
123                .map_init(
124                    || Worker::new(tsr.to_thread_local()),
125                    |worker, c| worker.numstat(c),
126                )
127                .collect(),
128            Backend::Null(_) => Ok(commits.iter().map(|c| null_diffstat(c)).collect()),
129        }
130    }
131}
132
/// Per-rayon-worker state for the real backend: a thread-local repository and
/// two diff resource caches built once and reused for every commit this worker
/// handles. `gix::Tree::stats` would instead build a fresh cache per call, which
/// re-parses the on-disk index and reassembles the attribute stack each time; on
/// a large history that rebuild, not the diff itself, dominates the runtime.
struct Worker {
    // Thread-local repository handle through which this worker reads commits
    // and trees; `Worker::new` gives it an 8 MiB object cache if none is set.
    repo: gix::Repository,
    // Commits at the shallow boundary, whose parents exist in their headers
    // but not in the object database. `None` when the clone is not shallow.
    shallow: Option<gix::shallow::Commits>,
    // `for_each_to_obtain_tree_with_cache` holds one cache for the tree walk
    // while each change's line diff needs another, so a worker keeps two. They
    // are built lazily because the cache constructor is fallible and rayon's
    // `map_init` initializer cannot return a `Result`.
    // Order of the pair: (tree-walk cache, line-count cache).
    caches: Option<(gix::diff::blob::Platform, gix::diff::blob::Platform)>,
}
149
150impl Worker {
151    fn new(mut repo: gix::Repository) -> Self {
152        // Per-thread object cache so repeated tree/blob reads during diffing hit
153        // memory instead of the pack. 8 MiB comfortably holds a commit's trees.
154        repo.object_cache_size_if_unset(8 * 1024 * 1024);
155        // An unreadable shallow file degrades to "not shallow"; a boundary
156        // commit then surfaces its missing parent as a read error below.
157        let shallow = repo.shallow_commits().ok().flatten();
158        Self {
159            repo,
160            shallow,
161            caches: None,
162        }
163    }
164
165    fn numstat(&mut self, commit: &WalkedCommit) -> Result<DiffStat> {
166        if commit.is_merge {
167            return Ok(DiffStat::default());
168        }
169        let id = match &commit.handle {
170            Handle::Real(id) => *id,
171            // A real backend never yields a null handle; nothing to diff.
172            Handle::Null(_) => return Ok(DiffStat::default()),
173        };
174        if self.caches.is_none() {
175            let walk = self
176                .repo
177                .diff_resource_cache_for_tree_diff()
178                .map_err(|e| Error::DiffStats(Box::new(e)))?;
179            let count = self
180                .repo
181                .diff_resource_cache_for_tree_diff()
182                .map_err(|e| Error::DiffStats(Box::new(e)))?;
183            self.caches = Some((walk, count));
184        }
185        let Self {
186            repo,
187            shallow,
188            caches,
189        } = self;
190        let (walk_cache, count_cache) = caches.as_mut().expect("initialized above");
191        numstat_real(repo, walk_cache, count_cache, shallow.as_ref(), id)
192    }
193}
194
195/// A canned (null-backend) commit's numstat: its stored value, or nothing for a
196/// merge. Real handles never reach the null backend.
197fn null_diffstat(commit: &WalkedCommit) -> DiffStat {
198    if commit.is_merge {
199        return DiffStat::default();
200    }
201    match &commit.handle {
202        Handle::Null(diff) => *diff,
203        Handle::Real(_) => DiffStat::default(),
204    }
205}
206
207fn walk_real(
208    repo: &gix::Repository,
209    range: &str,
210    need_trailers: bool,
211) -> Result<Vec<WalkedCommit>> {
212    let (tips, hidden) = resolve_range(repo, range)?;
213    let mailmap = repo.open_mailmap();
214    // git grafts commits at the shallow boundary as parentless, so one whose
215    // header names several parents is not a merge there. An unreadable shallow
216    // file degrades to "not shallow", mirroring Worker::new.
217    let shallow = repo.shallow_commits().ok().flatten();
218    let walk = repo
219        .rev_walk(tips)
220        .with_hidden(hidden)
221        .all()
222        .map_err(|e| Error::WalkRange {
223            range: range.to_string(),
224            source: Box::new(e),
225        })?;
226    let mut out = Vec::new();
227    for info in walk {
228        let info = info.map_err(|e| Error::ReadCommit(Box::new(e)))?;
229        let commit = repo
230            .find_commit(info.id)
231            .map_err(|e| Error::ReadCommit(Box::new(e)))?;
232        // Decode the header once; the author, committer, parents, and message
233        // accessors on `gix::Commit` would each rescan the raw bytes.
234        let commit = commit.decode().map_err(|e| Error::DecodeCommit {
235            id: info.id.to_string(),
236            source: Box::new(e),
237        })?;
238        let is_boundary = shallow
239            .as_ref()
240            .is_some_and(|s| s.binary_search(&info.id).is_ok());
241        let is_merge = !is_boundary && commit.parents.len() > 1;
242        out.push(WalkedCommit {
243            meta: commit_meta(&commit, &mailmap, need_trailers)?,
244            is_merge,
245            handle: Handle::Real(info.id),
246        });
247    }
248    Ok(out)
249}
250
251fn commit_meta(
252    commit: &gix::objs::CommitRef<'_>,
253    mailmap: &gix::mailmap::Snapshot,
254    need_trailers: bool,
255) -> Result<CommitMeta> {
256    let author = mailmap.resolve(
257        commit
258            .author()
259            .map_err(|e| Error::ReadCommit(Box::new(e)))?,
260    );
261    let time_seconds = commit
262        .committer()
263        .map_err(|e| Error::ReadCommit(Box::new(e)))?
264        .seconds();
265    let trailers = if need_trailers {
266        parse_trailers(commit)
267    } else {
268        Vec::new()
269    };
270    Ok(CommitMeta {
271        author: Author {
272            name: author.name.to_string(),
273            email: author.email.to_string(),
274        },
275        time_seconds,
276        trailers,
277    })
278}
279
280fn parse_trailers(commit: &gix::objs::CommitRef<'_>) -> Vec<Trailer> {
281    let Some(body) = commit.message().body() else {
282        return Vec::new();
283    };
284    body.trailers()
285        .map(|t| Trailer {
286            token: t.token.to_string(),
287            value: t.value.to_string(),
288        })
289        .collect()
290}
291
/// Compute the numstat of commit `id`: changed files plus inserted and deleted
/// lines relative to its first parent, or relative to the empty tree when the
/// commit has no parent or sits at the shallow boundary.
///
/// `walk_cache` is handed to the tree walk and `count_cache` to each change's
/// line diff; both have their resource cache cleared before this returns, so
/// they can be reused for the next commit.
///
/// # Errors
///
/// Returns `Error::ReadCommit` if the commit, its parent, or either tree
/// cannot be read, and `Error::DiffStats` if the tree diff itself fails. A
/// failure to diff or count an individual change is not an error: that change
/// is skipped.
fn numstat_real(
    repo: &gix::Repository,
    walk_cache: &mut gix::diff::blob::Platform,
    count_cache: &mut gix::diff::blob::Platform,
    shallow: Option<&gix::shallow::Commits>,
    id: gix::ObjectId,
) -> Result<DiffStat> {
    let commit = repo
        .find_commit(id)
        .map_err(|e| Error::ReadCommit(Box::new(e)))?;
    let new_tree = commit.tree().map_err(|e| Error::ReadCommit(Box::new(e)))?;
    // A commit at the shallow boundary names parents that are not in the
    // object database. git diffs it against the empty tree, exactly like a
    // root commit, so do the same instead of failing on the missing parent.
    let is_boundary = shallow.is_some_and(|s| s.binary_search(&id).is_ok());
    // Only the first parent is considered.
    let parent = if is_boundary {
        None
    } else {
        commit.parent_ids().next()
    };
    let old_tree = match parent {
        Some(parent) => repo
            .find_commit(parent.detach())
            .map_err(|e| Error::ReadCommit(Box::new(e)))?
            .tree()
            .map_err(|e| Error::ReadCommit(Box::new(e)))?,
        None => repo.empty_tree(),
    };

    // Equivalent to `gix::Tree::stats`, but driving caller-owned caches so the
    // index and attribute stack are read once per worker rather than per commit.
    let (mut files, mut insertions, mut deletions) = (0u64, 0u64, 0u64);
    old_tree
        .changes()
        .map_err(|e| Error::DiffStats(Box::new(e)))?
        .for_each_to_obtain_tree_with_cache(&new_tree, walk_cache, |change| {
            // Pure gitlink changes carry no blob; their line counts are fixed.
            if let Some((ins, del)) = gitlink_lines(&change) {
                files += 1;
                insertions += ins;
                deletions += del;
            } else {
                match change
                    .diff(count_cache)
                    .ok()
                    .and_then(|mut platform| platform.line_counts().ok())
                {
                    Some(Some(counts)) => {
                        files += 1;
                        insertions += u64::from(counts.insertions);
                        deletions += u64::from(counts.removals);
                    }
                    // A binary change has no line counts (numstat's `-  -  path`)
                    // but still counts as a changed file, as in `git diff --shortstat`.
                    Some(None) => files += 1,
                    // The blob diff or the line count failed: skip this change
                    // rather than failing the whole commit.
                    None => {}
                }
            }
            // The resource cache only grows; clear it between changes to bound memory.
            count_cache.clear_resource_cache_keep_allocation();
            Ok::<_, std::convert::Infallible>(std::ops::ControlFlow::Continue(()))
        })
        .map_err(|e| Error::DiffStats(Box::new(e)))?;
    // Same bound on memory for the cache the tree walk used.
    walk_cache.clear_resource_cache_keep_allocation();

    Ok(DiffStat {
        insertions,
        deletions,
        files,
    })
}
362
363/// The numstat of a submodule (gitlink) change, which carries no blob to diff.
364/// git renders the pointer as a one-line `Subproject commit <hash>` pseudo-file,
365/// so an added gitlink is +1, a removed one -1, and a repointed one +1/-1.
366/// Returns `None` for changes not purely between gitlinks; a blob<->gitlink
367/// type change falls through to the regular blob diff.
368fn gitlink_lines(change: &gix::object::tree::diff::Change<'_, '_, '_>) -> Option<(u64, u64)> {
369    use gix::object::tree::diff::Change;
370    match *change {
371        Change::Addition { entry_mode, .. } if entry_mode.is_commit() => Some((1, 0)),
372        Change::Deletion { entry_mode, .. } if entry_mode.is_commit() => Some((0, 1)),
373        Change::Modification {
374            previous_entry_mode,
375            entry_mode,
376            ..
377        } if previous_entry_mode.is_commit() && entry_mode.is_commit() => Some((1, 1)),
378        _ => None,
379    }
380}
381
382/// Tips to walk from, and commits to hide, for a revision range.
383type RangeEnds = (Vec<gix::ObjectId>, Vec<gix::ObjectId>);
384
385/// Resolve a revision range into walk tips and hidden commits. Each endpoint is
386/// fully git-spelled (refs, short hashes, `@{n}`, ...); only the `..`/`...`
387/// operators are interpreted here. Exotic gitrevisions(7) forms are unsupported.
388fn resolve_range(repo: &gix::Repository, range: &str) -> Result<RangeEnds> {
389    if let Some((a, b)) = range.split_once("...") {
390        let a = single(repo, default_head(a))?;
391        let b = single(repo, default_head(b))?;
392        let hidden = merge_bases(repo, a, b, range)?;
393        return Ok((vec![a, b], hidden));
394    }
395    if let Some((a, b)) = range.split_once("..") {
396        let excluded = single(repo, default_head(a))?;
397        let included = single(repo, default_head(b))?;
398        return Ok((vec![included], vec![excluded]));
399    }
400    Ok((vec![single(repo, range)?], Vec::new()))
401}
402
/// An omitted range endpoint (the empty string) stands for `HEAD`; anything
/// else is returned unchanged.
fn default_head(rev: &str) -> &str {
    match rev {
        "" => "HEAD",
        given => given,
    }
}
406
407fn single(repo: &gix::Repository, rev: &str) -> Result<gix::ObjectId> {
408    let err = |source: Box<dyn std::error::Error + Send + Sync>| Error::ResolveRevision {
409        revision: rev.to_string(),
410        source,
411    };
412    // git peels tags recursively at rev-list endpoints, so an annotated tag
413    // must resolve to its target commit here. Without peeling, a tag OID on
414    // the hidden side matches nothing during the walk and the range silently
415    // degrades to whole-repo history.
416    Ok(repo
417        .rev_parse_single(rev)
418        .map_err(|e| err(Box::new(e)))?
419        .object()
420        .map_err(|e| err(Box::new(e)))?
421        .peel_to_commit()
422        .map_err(|e| err(Box::new(e)))?
423        .id)
424}
425
426/// Every merge base of `a` and `b`, hidden from the walk to form the symmetric
427/// difference. Criss-cross histories have several, and git hides them all;
428/// hiding only the "best" one leaks the other bases' ancestries into the walk.
429/// Disjoint histories have none; then nothing is hidden and the symmetric
430/// difference is simply everything reachable from either tip.
431fn merge_bases(
432    repo: &gix::Repository,
433    a: gix::ObjectId,
434    b: gix::ObjectId,
435    range: &str,
436) -> Result<Vec<gix::ObjectId>> {
437    let bases = repo
438        .merge_bases_many(a, &[b])
439        .map_err(|e| Error::WalkRange {
440            range: range.to_string(),
441            source: Box::new(e),
442        })?;
443    Ok(bases.into_iter().map(gix::Id::detach).collect())
444}
445
446/// Parse a `--since`/`--until` style date into seconds since the Unix epoch.
447/// Accepts ISO 8601, RFC 2822, unix timestamps, and relative spellings like
448/// "2 weeks ago" (a documented subset of git's approxidate).
449///
450/// # Errors
451///
452/// Returns an error if the input is not a recognized date format.
453pub fn parse_date(input: Option<&str>) -> Result<Option<i64>> {
454    let Some(s) = input else { return Ok(None) };
455    let now = std::time::SystemTime::now();
456    let time = gix::date::parse(s, Some(now)).map_err(|e| Error::InvalidDate {
457        input: s.to_string(),
458        message: e.to_string(),
459    })?;
460    Ok(Some(time.seconds))
461}
462
#[cfg(test)]
mod tests {
    use super::parse_date;

    // No input means no date bound, not an error.
    #[test]
    fn parse_date_returns_none_for_no_input() {
        let parsed = parse_date(None).unwrap();
        assert!(parsed.is_none());
    }

    // A plain calendar date is one of the accepted spellings.
    #[test]
    fn parse_date_accepts_an_iso_date() {
        let parsed = parse_date(Some("2020-01-01")).unwrap();
        assert!(parsed.is_some());
    }

    // Anything unrecognized must surface as an error.
    #[test]
    fn parse_date_rejects_unparseable_input() {
        let outcome = parse_date(Some("not-a-real-date"));
        assert!(outcome.is_err());
    }
}
481}