Skip to main content

git_stats/
repo.rs

1//! Infrastructure: the only module that talks to gix and the filesystem.
2//!
3//! [`Repo`] has two constructors. [`Repo::open`] reads a real repository;
4//! [`Repo::create_null`] returns an in-memory stand-in driven by canned commits,
5//! so application and logic code can be tested without touching git.
6
7use rayon::prelude::*;
8
9use crate::error::{Error, Result};
10use crate::model::{Author, CommitMeta, DiffStat, Trailer};
11
12/// A commit yielded by [`Repo::walk`]: its logic-facing metadata plus an opaque
13/// handle the wrapper uses to compute the numstat later.
14pub struct WalkedCommit {
15    pub meta: CommitMeta,
16    is_merge: bool,
17    handle: Handle,
18}
19
20/// How to obtain a commit's numstat: from gix (real) or from a canned value (null).
21enum Handle {
22    Real(gix::ObjectId),
23    Null(DiffStat),
24}
25
26/// A canned commit used to build a nulled [`Repo`] in tests.
27#[derive(Debug, Clone)]
28pub struct NulledCommit {
29    pub meta: CommitMeta,
30    pub diff: DiffStat,
31    pub is_merge: bool,
32}
33
34enum Backend {
35    // Boxed because a `ThreadSafeRepository` is far larger than the null variant.
36    Real(Box<gix::ThreadSafeRepository>),
37    Null(Vec<NulledCommit>),
38}
39
40/// A git repository, or a nulled stand-in for it.
41pub struct Repo {
42    backend: Backend,
43}
44
45impl Repo {
46    /// Open the repository containing `path`, searching upward like git does.
47    ///
48    /// # Errors
49    ///
50    /// Returns an error if no git repository can be discovered from `path`.
51    pub fn open(path: impl AsRef<std::path::Path>) -> Result<Self> {
52        let repo = gix::discover(path).map_err(|e| Error::OpenRepository(Box::new(e)))?;
53        Ok(Self {
54            backend: Backend::Real(Box::new(repo.into_sync())),
55        })
56    }
57
58    /// Build a nulled repository whose [`walk`](Self::walk) returns the given
59    /// commits verbatim. The revision range is ignored, since range resolution
60    /// is real-git behavior exercised separately by integration tests.
61    #[must_use]
62    pub fn create_null(commits: Vec<NulledCommit>) -> Self {
63        Self {
64            backend: Backend::Null(commits),
65        }
66    }
67
68    /// Read commit metadata for every commit in `range`. Trailers are parsed
69    /// only when `need_trailers` is set, since they feed only the reviews table
70    /// and decoding every commit message is costly on large histories.
71    ///
72    /// # Errors
73    ///
74    /// Returns an error if the range cannot be resolved or a commit object
75    /// cannot be read.
76    pub fn walk(&self, range: &str, need_trailers: bool) -> Result<Vec<WalkedCommit>> {
77        match &self.backend {
78            Backend::Real(tsr) => {
79                let mut repo = tsr.to_thread_local();
80                // Match `numstats`: an object cache lets the metadata read reuse
81                // commits the traversal already decoded instead of re-reading packs.
82                repo.object_cache_size_if_unset(8 * 1024 * 1024);
83                walk_real(&repo, range, need_trailers)
84            }
85            Backend::Null(commits) => Ok(commits
86                .iter()
87                .map(|c| WalkedCommit {
88                    meta: c.meta.clone(),
89                    is_merge: c.is_merge,
90                    handle: Handle::Null(c.diff),
91                })
92                .collect()),
93        }
94    }
95
96    /// Compute the numstat for each given commit. Merge commits contribute
97    /// nothing, matching `git log --numstat`'s default.
98    ///
99    /// # Errors
100    ///
101    /// Returns an error if a commit's tree or its parent's tree cannot be
102    /// diffed.
103    pub fn numstats(&self, commits: &[&WalkedCommit]) -> Result<Vec<DiffStat>> {
104        match &self.backend {
105            Backend::Real(tsr) => commits
106                .par_iter()
107                .map_init(
108                    || Worker::new(tsr.to_thread_local()),
109                    |worker, c| worker.numstat(c),
110                )
111                .collect(),
112            Backend::Null(_) => Ok(commits.iter().map(|c| null_diffstat(c)).collect()),
113        }
114    }
115}
116
117/// Per-rayon-worker state for the real backend: a thread-local repository and
118/// two diff resource caches built once and reused for every commit this worker
119/// handles. `gix::Tree::stats` would instead build a fresh cache per call, which
120/// re-parses the on-disk index and reassembles the attribute stack each time; on
121/// a large history that rebuild, not the diff itself, dominates the runtime.
122struct Worker {
123    repo: gix::Repository,
124    // `for_each_to_obtain_tree_with_cache` holds one cache for the tree walk
125    // while each change's line diff needs another, so a worker keeps two. They
126    // are built lazily because the cache constructor is fallible and rayon's
127    // `map_init` initializer cannot return a `Result`.
128    caches: Option<(gix::diff::blob::Platform, gix::diff::blob::Platform)>,
129}
130
131impl Worker {
132    fn new(mut repo: gix::Repository) -> Self {
133        // Per-thread object cache so repeated tree/blob reads during diffing hit
134        // memory instead of the pack. 8 MiB comfortably holds a commit's trees.
135        repo.object_cache_size_if_unset(8 * 1024 * 1024);
136        Self { repo, caches: None }
137    }
138
139    fn numstat(&mut self, commit: &WalkedCommit) -> Result<DiffStat> {
140        if commit.is_merge {
141            return Ok(DiffStat::default());
142        }
143        let id = match &commit.handle {
144            Handle::Real(id) => *id,
145            // A real backend never yields a null handle; nothing to diff.
146            Handle::Null(_) => return Ok(DiffStat::default()),
147        };
148        if self.caches.is_none() {
149            let walk = self
150                .repo
151                .diff_resource_cache_for_tree_diff()
152                .map_err(|e| Error::DiffStats(Box::new(e)))?;
153            let count = self
154                .repo
155                .diff_resource_cache_for_tree_diff()
156                .map_err(|e| Error::DiffStats(Box::new(e)))?;
157            self.caches = Some((walk, count));
158        }
159        let Self { repo, caches } = self;
160        let (walk_cache, count_cache) = caches.as_mut().expect("initialized above");
161        numstat_real(repo, walk_cache, count_cache, id)
162    }
163}
164
165/// A canned (null-backend) commit's numstat: its stored value, or nothing for a
166/// merge. Real handles never reach the null backend.
167fn null_diffstat(commit: &WalkedCommit) -> DiffStat {
168    if commit.is_merge {
169        return DiffStat::default();
170    }
171    match &commit.handle {
172        Handle::Null(diff) => *diff,
173        Handle::Real(_) => DiffStat::default(),
174    }
175}
176
177fn walk_real(
178    repo: &gix::Repository,
179    range: &str,
180    need_trailers: bool,
181) -> Result<Vec<WalkedCommit>> {
182    let (tips, hidden) = resolve_range(repo, range)?;
183    let mailmap = repo.open_mailmap();
184    let walk = repo
185        .rev_walk(tips)
186        .with_hidden(hidden)
187        .all()
188        .map_err(|e| Error::WalkRange {
189            range: range.to_string(),
190            source: Box::new(e),
191        })?;
192    let mut out = Vec::new();
193    for info in walk {
194        let info = info.map_err(|e| Error::ReadCommit(Box::new(e)))?;
195        let commit = repo
196            .find_commit(info.id)
197            .map_err(|e| Error::ReadCommit(Box::new(e)))?;
198        let is_merge = commit.parent_ids().take(2).count() > 1;
199        out.push(WalkedCommit {
200            meta: commit_meta(&commit, &mailmap, need_trailers)?,
201            is_merge,
202            handle: Handle::Real(info.id),
203        });
204    }
205    Ok(out)
206}
207
208fn commit_meta(
209    commit: &gix::Commit,
210    mailmap: &gix::mailmap::Snapshot,
211    need_trailers: bool,
212) -> Result<CommitMeta> {
213    let author = mailmap.resolve(
214        commit
215            .author()
216            .map_err(|e| Error::ReadCommit(Box::new(e)))?,
217    );
218    let time_seconds = commit
219        .committer()
220        .map_err(|e| Error::ReadCommit(Box::new(e)))?
221        .seconds();
222    let trailers = if need_trailers {
223        parse_trailers(commit)?
224    } else {
225        Vec::new()
226    };
227    Ok(CommitMeta {
228        author: Author {
229            name: author.name.to_string(),
230            email: author.email.to_string(),
231        },
232        time_seconds,
233        trailers,
234    })
235}
236
237fn parse_trailers(commit: &gix::Commit) -> Result<Vec<Trailer>> {
238    let message = commit
239        .message()
240        .map_err(|e| Error::ReadCommit(Box::new(e)))?;
241    let Some(body) = message.body() else {
242        return Ok(Vec::new());
243    };
244    Ok(body
245        .trailers()
246        .map(|t| Trailer {
247            token: t.token.to_string(),
248            value: t.value.to_string(),
249        })
250        .collect())
251}
252
253fn numstat_real(
254    repo: &gix::Repository,
255    walk_cache: &mut gix::diff::blob::Platform,
256    count_cache: &mut gix::diff::blob::Platform,
257    id: gix::ObjectId,
258) -> Result<DiffStat> {
259    let commit = repo
260        .find_commit(id)
261        .map_err(|e| Error::ReadCommit(Box::new(e)))?;
262    let new_tree = commit.tree().map_err(|e| Error::ReadCommit(Box::new(e)))?;
263    let old_tree = match commit.parent_ids().next() {
264        Some(parent) => repo
265            .find_commit(parent.detach())
266            .map_err(|e| Error::ReadCommit(Box::new(e)))?
267            .tree()
268            .map_err(|e| Error::ReadCommit(Box::new(e)))?,
269        None => repo.empty_tree(),
270    };
271
272    // Equivalent to `gix::Tree::stats`, but driving caller-owned caches so the
273    // index and attribute stack are read once per worker rather than per commit.
274    let (mut files, mut insertions, mut deletions) = (0u64, 0u64, 0u64);
275    old_tree
276        .changes()
277        .map_err(|e| Error::DiffStats(Box::new(e)))?
278        .for_each_to_obtain_tree_with_cache(&new_tree, walk_cache, |change| {
279            if let Some(counts) = change
280                .diff(count_cache)
281                .ok()
282                .and_then(|mut platform| platform.line_counts().ok())
283                .flatten()
284            {
285                files += 1;
286                insertions += u64::from(counts.insertions);
287                deletions += u64::from(counts.removals);
288            }
289            // The resource cache only grows; clear it between changes to bound memory.
290            count_cache.clear_resource_cache_keep_allocation();
291            Ok::<_, std::convert::Infallible>(std::ops::ControlFlow::Continue(()))
292        })
293        .map_err(|e| Error::DiffStats(Box::new(e)))?;
294    walk_cache.clear_resource_cache_keep_allocation();
295
296    Ok(DiffStat {
297        insertions,
298        deletions,
299        files,
300    })
301}
302
303/// Tips to walk from, and commits to hide, for a revision range.
304type RangeEnds = (Vec<gix::ObjectId>, Vec<gix::ObjectId>);
305
306/// Resolve a revision range into walk tips and hidden commits. Each endpoint is
307/// fully git-spelled (refs, short hashes, `@{n}`, ...); only the `..`/`...`
308/// operators are interpreted here. Exotic gitrevisions(7) forms are unsupported.
309fn resolve_range(repo: &gix::Repository, range: &str) -> Result<RangeEnds> {
310    if let Some((a, b)) = range.split_once("...") {
311        let a = single(repo, default_head(a))?;
312        let b = single(repo, default_head(b))?;
313        return Ok((vec![a, b], merge_bases(repo, a, b)));
314    }
315    if let Some((a, b)) = range.split_once("..") {
316        let excluded = single(repo, default_head(a))?;
317        let included = single(repo, default_head(b))?;
318        return Ok((vec![included], vec![excluded]));
319    }
320    Ok((vec![single(repo, range)?], Vec::new()))
321}
322
/// Substitute `HEAD` for an empty range endpoint; any other spelling is
/// returned unchanged.
fn default_head(rev: &str) -> &str {
    match rev {
        "" => "HEAD",
        given => given,
    }
}
326
327fn single(repo: &gix::Repository, rev: &str) -> Result<gix::ObjectId> {
328    let err = |source: Box<dyn std::error::Error + Send + Sync>| Error::ResolveRevision {
329        revision: rev.to_string(),
330        source,
331    };
332    // git peels tags recursively at rev-list endpoints, so an annotated tag
333    // must resolve to its target commit here. Without peeling, a tag OID on
334    // the hidden side matches nothing during the walk and the range silently
335    // degrades to whole-repo history.
336    Ok(repo
337        .rev_parse_single(rev)
338        .map_err(|e| err(Box::new(e)))?
339        .object()
340        .map_err(|e| err(Box::new(e)))?
341        .peel_to_commit()
342        .map_err(|e| err(Box::new(e)))?
343        .id)
344}
345
346fn merge_bases(repo: &gix::Repository, a: gix::ObjectId, b: gix::ObjectId) -> Vec<gix::ObjectId> {
347    // Disjoint histories have no merge base; then nothing is hidden and the
348    // symmetric difference is simply everything reachable from either tip.
349    match repo.merge_base(a, b) {
350        Ok(base) => vec![base.detach()],
351        Err(_) => Vec::new(),
352    }
353}
354
355/// Parse a `--since`/`--until` style date into seconds since the Unix epoch.
356/// Accepts ISO 8601, RFC 2822, unix timestamps, and relative spellings like
357/// "2 weeks ago" (a documented subset of git's approxidate).
358///
359/// # Errors
360///
361/// Returns an error if the input is not a recognized date format.
362pub fn parse_date(input: Option<&str>) -> Result<Option<i64>> {
363    let Some(s) = input else { return Ok(None) };
364    let now = std::time::SystemTime::now();
365    let time = gix::date::parse(s, Some(now)).map_err(|e| Error::InvalidDate {
366        input: s.to_string(),
367        message: e.to_string(),
368    })?;
369    Ok(Some(time.seconds))
370}
371
#[cfg(test)]
mod tests {
    use super::parse_date;

    /// No input means "no bound": the absence is passed through.
    #[test]
    fn parse_date_returns_none_for_no_input() {
        let parsed = parse_date(None).unwrap();
        assert_eq!(parsed, None);
    }

    /// A plain ISO calendar date resolves to some timestamp.
    #[test]
    fn parse_date_accepts_an_iso_date() {
        let parsed = parse_date(Some("2020-01-01")).unwrap();
        assert!(parsed.is_some());
    }

    /// Text that is not a date is reported as an error.
    #[test]
    fn parse_date_rejects_unparseable_input() {
        let outcome = parse_date(Some("not-a-real-date"));
        assert!(outcome.is_err());
    }
}