Skip to main content

git_stats/
repo.rs

1//! Infrastructure: the only module that talks to gix and the filesystem.
2//!
3//! [`Repo`] has two constructors. [`Repo::open`] reads a real repository;
4//! [`Repo::create_null`] returns an in-memory stand-in driven by canned commits,
5//! so application and logic code can be tested without touching git.
6
7use rayon::prelude::*;
8
9use crate::error::{Error, Result};
10use crate::model::{Author, CommitMeta, DiffStat, Trailer};
11
12/// A commit yielded by [`Repo::walk`]: its logic-facing metadata plus an opaque
13/// handle the wrapper uses to compute the numstat later.
14pub struct WalkedCommit {
15    pub meta: CommitMeta,
16    is_merge: bool,
17    handle: Handle,
18}
19
20/// How to obtain a commit's numstat: from gix (real) or from a canned value (null).
21enum Handle {
22    Real(gix::ObjectId),
23    Null(DiffStat),
24}
25
26/// A canned commit used to build a nulled [`Repo`] in tests.
27#[derive(Debug, Clone)]
28pub struct NulledCommit {
29    pub meta: CommitMeta,
30    pub diff: DiffStat,
31    pub is_merge: bool,
32}
33
34enum Backend {
35    // Boxed because a `ThreadSafeRepository` is far larger than the null variant.
36    Real(Box<gix::ThreadSafeRepository>),
37    Null(Vec<NulledCommit>),
38}
39
40/// A git repository, or a nulled stand-in for it.
41pub struct Repo {
42    backend: Backend,
43}
44
45impl Repo {
46    /// Open the repository containing `path`, searching upward like git does.
47    ///
48    /// # Errors
49    ///
50    /// Returns an error if no git repository can be discovered from `path`.
51    pub fn open(path: impl AsRef<std::path::Path>) -> Result<Self> {
52        let repo = gix::discover(path).map_err(|e| Error::OpenRepository(Box::new(e)))?;
53        Ok(Self {
54            backend: Backend::Real(Box::new(repo.into_sync())),
55        })
56    }
57
58    /// Build a nulled repository whose [`walk`](Self::walk) returns the given
59    /// commits verbatim. The revision range is ignored, since range resolution
60    /// is real-git behavior exercised separately by integration tests.
61    #[must_use]
62    pub fn create_null(commits: Vec<NulledCommit>) -> Self {
63        Self {
64            backend: Backend::Null(commits),
65        }
66    }
67
68    /// Read commit metadata for every commit in `range`. Trailers are parsed
69    /// only when `need_trailers` is set, since they feed only the reviews table
70    /// and decoding every commit message is costly on large histories.
71    ///
72    /// # Errors
73    ///
74    /// Returns an error if the range cannot be resolved or a commit object
75    /// cannot be read.
76    pub fn walk(&self, range: &str, need_trailers: bool) -> Result<Vec<WalkedCommit>> {
77        match &self.backend {
78            Backend::Real(tsr) => {
79                let mut repo = tsr.to_thread_local();
80                // Match `numstats`: an object cache lets the metadata read reuse
81                // commits the traversal already decoded instead of re-reading packs.
82                repo.object_cache_size_if_unset(8 * 1024 * 1024);
83                walk_real(&repo, range, need_trailers)
84            }
85            Backend::Null(commits) => Ok(commits
86                .iter()
87                .map(|c| WalkedCommit {
88                    meta: c.meta.clone(),
89                    is_merge: c.is_merge,
90                    handle: Handle::Null(c.diff),
91                })
92                .collect()),
93        }
94    }
95
96    /// Compute the numstat for each given commit. Merge commits contribute
97    /// nothing, matching `git log --numstat`'s default.
98    ///
99    /// # Errors
100    ///
101    /// Returns an error if a commit's tree or its parent's tree cannot be
102    /// diffed.
103    pub fn numstats(&self, commits: &[&WalkedCommit]) -> Result<Vec<DiffStat>> {
104        match &self.backend {
105            Backend::Real(tsr) => commits
106                .par_iter()
107                .map_init(
108                    || Worker::new(tsr.to_thread_local()),
109                    |worker, c| worker.numstat(c),
110                )
111                .collect(),
112            Backend::Null(_) => Ok(commits.iter().map(|c| null_diffstat(c)).collect()),
113        }
114    }
115}
116
117/// Per-rayon-worker state for the real backend: a thread-local repository and
118/// two diff resource caches built once and reused for every commit this worker
119/// handles. `gix::Tree::stats` would instead build a fresh cache per call, which
120/// re-parses the on-disk index and reassembles the attribute stack each time; on
121/// a large history that rebuild, not the diff itself, dominates the runtime.
122struct Worker {
123    repo: gix::Repository,
124    // `for_each_to_obtain_tree_with_cache` holds one cache for the tree walk
125    // while each change's line diff needs another, so a worker keeps two. They
126    // are built lazily because the cache constructor is fallible and rayon's
127    // `map_init` initializer cannot return a `Result`.
128    caches: Option<(gix::diff::blob::Platform, gix::diff::blob::Platform)>,
129}
130
131impl Worker {
132    fn new(mut repo: gix::Repository) -> Self {
133        // Per-thread object cache so repeated tree/blob reads during diffing hit
134        // memory instead of the pack. 8 MiB comfortably holds a commit's trees.
135        repo.object_cache_size_if_unset(8 * 1024 * 1024);
136        Self { repo, caches: None }
137    }
138
139    fn numstat(&mut self, commit: &WalkedCommit) -> Result<DiffStat> {
140        if commit.is_merge {
141            return Ok(DiffStat::default());
142        }
143        let id = match &commit.handle {
144            Handle::Real(id) => *id,
145            // A real backend never yields a null handle; nothing to diff.
146            Handle::Null(_) => return Ok(DiffStat::default()),
147        };
148        if self.caches.is_none() {
149            let walk = self
150                .repo
151                .diff_resource_cache_for_tree_diff()
152                .map_err(|e| Error::DiffStats(Box::new(e)))?;
153            let count = self
154                .repo
155                .diff_resource_cache_for_tree_diff()
156                .map_err(|e| Error::DiffStats(Box::new(e)))?;
157            self.caches = Some((walk, count));
158        }
159        let Self { repo, caches } = self;
160        let (walk_cache, count_cache) = caches.as_mut().expect("initialized above");
161        numstat_real(repo, walk_cache, count_cache, id)
162    }
163}
164
165/// A canned (null-backend) commit's numstat: its stored value, or nothing for a
166/// merge. Real handles never reach the null backend.
167fn null_diffstat(commit: &WalkedCommit) -> DiffStat {
168    if commit.is_merge {
169        return DiffStat::default();
170    }
171    match &commit.handle {
172        Handle::Null(diff) => *diff,
173        Handle::Real(_) => DiffStat::default(),
174    }
175}
176
177fn walk_real(repo: &gix::Repository, range: &str, need_trailers: bool) -> Result<Vec<WalkedCommit>> {
178    let (tips, hidden) = resolve_range(repo, range)?;
179    let mailmap = repo.open_mailmap();
180    let walk = repo
181        .rev_walk(tips)
182        .with_hidden(hidden)
183        .all()
184        .map_err(|e| Error::WalkRange {
185            range: range.to_string(),
186            source: Box::new(e),
187        })?;
188    let mut out = Vec::new();
189    for info in walk {
190        let info = info.map_err(|e| Error::ReadCommit(Box::new(e)))?;
191        let commit = repo
192            .find_commit(info.id)
193            .map_err(|e| Error::ReadCommit(Box::new(e)))?;
194        let is_merge = commit.parent_ids().take(2).count() > 1;
195        out.push(WalkedCommit {
196            meta: commit_meta(&commit, &mailmap, need_trailers)?,
197            is_merge,
198            handle: Handle::Real(info.id),
199        });
200    }
201    Ok(out)
202}
203
204fn commit_meta(
205    commit: &gix::Commit,
206    mailmap: &gix::mailmap::Snapshot,
207    need_trailers: bool,
208) -> Result<CommitMeta> {
209    let author = mailmap.resolve(
210        commit
211            .author()
212            .map_err(|e| Error::ReadCommit(Box::new(e)))?,
213    );
214    let time_seconds = commit
215        .committer()
216        .map_err(|e| Error::ReadCommit(Box::new(e)))?
217        .seconds();
218    let trailers = if need_trailers {
219        parse_trailers(commit)?
220    } else {
221        Vec::new()
222    };
223    Ok(CommitMeta {
224        author: Author {
225            name: author.name.to_string(),
226            email: author.email.to_string(),
227        },
228        time_seconds,
229        trailers,
230    })
231}
232
233fn parse_trailers(commit: &gix::Commit) -> Result<Vec<Trailer>> {
234    let message = commit
235        .message()
236        .map_err(|e| Error::ReadCommit(Box::new(e)))?;
237    let Some(body) = message.body() else {
238        return Ok(Vec::new());
239    };
240    Ok(body
241        .trailers()
242        .map(|t| Trailer {
243            token: t.token.to_string(),
244            value: t.value.to_string(),
245        })
246        .collect())
247}
248
249fn numstat_real(
250    repo: &gix::Repository,
251    walk_cache: &mut gix::diff::blob::Platform,
252    count_cache: &mut gix::diff::blob::Platform,
253    id: gix::ObjectId,
254) -> Result<DiffStat> {
255    let commit = repo
256        .find_commit(id)
257        .map_err(|e| Error::ReadCommit(Box::new(e)))?;
258    let new_tree = commit.tree().map_err(|e| Error::ReadCommit(Box::new(e)))?;
259    let old_tree = match commit.parent_ids().next() {
260        Some(parent) => repo
261            .find_commit(parent.detach())
262            .map_err(|e| Error::ReadCommit(Box::new(e)))?
263            .tree()
264            .map_err(|e| Error::ReadCommit(Box::new(e)))?,
265        None => repo.empty_tree(),
266    };
267
268    // Equivalent to `gix::Tree::stats`, but driving caller-owned caches so the
269    // index and attribute stack are read once per worker rather than per commit.
270    let (mut files, mut insertions, mut deletions) = (0u64, 0u64, 0u64);
271    old_tree
272        .changes()
273        .map_err(|e| Error::DiffStats(Box::new(e)))?
274        .for_each_to_obtain_tree_with_cache(&new_tree, walk_cache, |change| {
275            if let Some(counts) = change
276                .diff(count_cache)
277                .ok()
278                .and_then(|mut platform| platform.line_counts().ok())
279                .flatten()
280            {
281                files += 1;
282                insertions += u64::from(counts.insertions);
283                deletions += u64::from(counts.removals);
284            }
285            // The resource cache only grows; clear it between changes to bound memory.
286            count_cache.clear_resource_cache_keep_allocation();
287            Ok::<_, std::convert::Infallible>(std::ops::ControlFlow::Continue(()))
288        })
289        .map_err(|e| Error::DiffStats(Box::new(e)))?;
290    walk_cache.clear_resource_cache_keep_allocation();
291
292    Ok(DiffStat {
293        insertions,
294        deletions,
295        files,
296    })
297}
298
299/// Tips to walk from, and commits to hide, for a revision range.
300type RangeEnds = (Vec<gix::ObjectId>, Vec<gix::ObjectId>);
301
302/// Resolve a revision range into walk tips and hidden commits. Each endpoint is
303/// fully git-spelled (refs, short hashes, `@{n}`, ...); only the `..`/`...`
304/// operators are interpreted here. Exotic gitrevisions(7) forms are unsupported.
305fn resolve_range(repo: &gix::Repository, range: &str) -> Result<RangeEnds> {
306    if let Some((a, b)) = range.split_once("...") {
307        let a = single(repo, default_head(a))?;
308        let b = single(repo, default_head(b))?;
309        return Ok((vec![a, b], merge_bases(repo, a, b)));
310    }
311    if let Some((a, b)) = range.split_once("..") {
312        let excluded = single(repo, default_head(a))?;
313        let included = single(repo, default_head(b))?;
314        return Ok((vec![included], vec![excluded]));
315    }
316    Ok((vec![single(repo, range)?], Vec::new()))
317}
318
/// Substitute `"HEAD"` for an empty range endpoint; any other spelling is
/// returned unchanged.
fn default_head(rev: &str) -> &str {
    match rev {
        "" => "HEAD",
        spelled => spelled,
    }
}
322
323fn single(repo: &gix::Repository, rev: &str) -> Result<gix::ObjectId> {
324    Ok(repo
325        .rev_parse_single(rev)
326        .map_err(|e| Error::ResolveRevision {
327            revision: rev.to_string(),
328            source: Box::new(e),
329        })?
330        .detach())
331}
332
333fn merge_bases(repo: &gix::Repository, a: gix::ObjectId, b: gix::ObjectId) -> Vec<gix::ObjectId> {
334    // Disjoint histories have no merge base; then nothing is hidden and the
335    // symmetric difference is simply everything reachable from either tip.
336    match repo.merge_base(a, b) {
337        Ok(base) => vec![base.detach()],
338        Err(_) => Vec::new(),
339    }
340}
341
342/// Parse a `--since`/`--until` style date into seconds since the Unix epoch.
343/// Accepts ISO 8601, RFC 2822, unix timestamps, and relative spellings like
344/// "2 weeks ago" (a documented subset of git's approxidate).
345///
346/// # Errors
347///
348/// Returns an error if the input is not a recognized date format.
349pub fn parse_date(input: Option<&str>) -> Result<Option<i64>> {
350    let Some(s) = input else { return Ok(None) };
351    let now = std::time::SystemTime::now();
352    let time = gix::date::parse(s, Some(now)).map_err(|e| Error::InvalidDate {
353        input: s.to_string(),
354        message: e.to_string(),
355    })?;
356    Ok(Some(time.seconds))
357}
358
#[cfg(test)]
mod tests {
    use super::parse_date;

    /// Absent input is not an error; it simply yields no timestamp.
    #[test]
    fn parse_date_returns_none_for_no_input() {
        assert!(matches!(parse_date(None), Ok(None)));
    }

    /// A plain ISO calendar date resolves to some timestamp.
    #[test]
    fn parse_date_accepts_an_iso_date() {
        let parsed = parse_date(Some("2020-01-01"));
        assert!(matches!(parsed, Ok(Some(_))));
    }

    /// Text that is not a date is reported as an error.
    #[test]
    fn parse_date_rejects_unparseable_input() {
        let parsed = parse_date(Some("not-a-real-date"));
        assert!(parsed.is_err());
    }
}
377}