// git-stats 0.2.1
//
// A tool for getting aggregated commit stats
// Documentation
//! Infrastructure: the only module that talks to gix and the filesystem.
//!
//! [`Repo`] has two constructors. [`Repo::open`] reads a real repository;
//! [`Repo::create_null`] returns an in-memory stand-in driven by canned commits,
//! so application and logic code can be tested without touching git.

use rayon::prelude::*;

use crate::error::{Error, Result};
use crate::model::{Author, CommitMeta, DiffStat, Trailer};

/// A commit yielded by [`Repo::walk`]: its logic-facing metadata plus an opaque
/// handle the wrapper uses to compute the numstat later.
///
/// `Debug` is derived so that a `Result<Vec<WalkedCommit>>` can be used with
/// `unwrap_err`/`expect_err` and shows up readably in test failures.
#[derive(Debug)]
pub struct WalkedCommit {
    /// Author, commit time and (when requested) trailers, as consumed by logic code.
    pub meta: CommitMeta,
    // True when the commit has more than one parent; such commits report an
    // empty numstat.
    is_merge: bool,
    // Where `Repo::numstats` takes this commit's diff statistics from.
    handle: Handle,
}

/// How to obtain a commit's numstat: from gix (real) or from a canned value (null).
#[derive(Debug)]
enum Handle {
    /// Id of the commit to look up and diff against its first parent.
    Real(gix::ObjectId),
    /// Statistics supplied up front through a [`NulledCommit`].
    Null(DiffStat),
}

/// A canned commit used to build a nulled [`Repo`] in tests.
///
/// Pass a list of these to [`Repo::create_null`]; the nulled repository hands
/// them back from `walk` and `numstats` without touching git.
#[derive(Debug, Clone)]
pub struct NulledCommit {
    /// Metadata that `walk` returns for this commit (cloned as-is).
    pub meta: CommitMeta,
    /// Numstat that `numstats` returns for this commit, unless it is a merge.
    pub diff: DiffStat,
    /// Marks the commit as a merge; `numstats` then reports a default
    /// (empty) `DiffStat` instead of `diff`.
    pub is_merge: bool,
}

/// Where a [`Repo`] gets its data from.
enum Backend {
    /// A real repository opened through gix. Per-thread handles are created
    /// from it with `to_thread_local` whenever work is done.
    // Boxed because a `ThreadSafeRepository` is far larger than the null variant.
    Real(Box<gix::ThreadSafeRepository>),
    /// Canned commits returned verbatim, for tests.
    Null(Vec<NulledCommit>),
}

/// A git repository, or a nulled stand-in for it.
///
/// Construct it with `Repo::open` for a repository on disk or with
/// `Repo::create_null` for an in-memory stand-in fed by canned commits.
pub struct Repo {
    // Selects between the gix-backed and the canned implementation.
    backend: Backend,
}

impl Repo {
    /// Open the repository containing `path`, searching upward like git does.
    ///
    /// # Errors
    ///
    /// Returns an error if no git repository can be discovered from `path`.
    pub fn open(path: impl AsRef<std::path::Path>) -> Result<Self> {
        let repo = gix::discover(path).map_err(|e| Error::OpenRepository(Box::new(e)))?;
        Ok(Self {
            backend: Backend::Real(Box::new(repo.into_sync())),
        })
    }

    /// Build a nulled repository whose [`walk`](Self::walk) returns the given
    /// commits verbatim. The revision range is ignored, since range resolution
    /// is real-git behavior exercised separately by integration tests.
    #[must_use]
    pub fn create_null(commits: Vec<NulledCommit>) -> Self {
        Self {
            backend: Backend::Null(commits),
        }
    }

    /// Read commit metadata for every commit in `range`. Trailers are parsed
    /// only when `need_trailers` is set, since they feed only the reviews table
    /// and decoding every commit message is costly on large histories.
    ///
    /// # Errors
    ///
    /// Returns an error if the range cannot be resolved or a commit object
    /// cannot be read.
    pub fn walk(&self, range: &str, need_trailers: bool) -> Result<Vec<WalkedCommit>> {
        match &self.backend {
            Backend::Real(tsr) => {
                let mut repo = tsr.to_thread_local();
                // Match `numstats`: an object cache lets the metadata read reuse
                // commits the traversal already decoded instead of re-reading packs.
                repo.object_cache_size_if_unset(8 * 1024 * 1024);
                walk_real(&repo, range, need_trailers)
            }
            Backend::Null(commits) => Ok(commits
                .iter()
                .map(|c| WalkedCommit {
                    meta: c.meta.clone(),
                    is_merge: c.is_merge,
                    handle: Handle::Null(c.diff),
                })
                .collect()),
        }
    }

    /// Compute the numstat for each given commit. Merge commits contribute
    /// nothing, matching `git log --numstat`'s default.
    ///
    /// # Errors
    ///
    /// Returns an error if a commit's tree or its parent's tree cannot be
    /// diffed.
    pub fn numstats(&self, commits: &[&WalkedCommit]) -> Result<Vec<DiffStat>> {
        match &self.backend {
            Backend::Real(tsr) => commits
                .par_iter()
                .map_init(
                    || Worker::new(tsr.to_thread_local()),
                    |worker, c| worker.numstat(c),
                )
                .collect(),
            Backend::Null(_) => Ok(commits.iter().map(|c| null_diffstat(c)).collect()),
        }
    }
}

/// Per-rayon-worker state for the real backend: a thread-local repository and
/// two diff resource caches built once and reused for every commit this worker
/// handles. `gix::Tree::stats` would instead build a fresh cache per call, which
/// re-parses the on-disk index and reassembles the attribute stack each time; on
/// a large history that rebuild, not the diff itself, dominates the runtime.
struct Worker {
    // Thread-local handle used for every object lookup and diff of this worker.
    repo: gix::Repository,
    // `for_each_to_obtain_tree_with_cache` holds one cache for the tree walk
    // while each change's line diff needs another, so a worker keeps two. They
    // are built lazily because the cache constructor is fallible and rayon's
    // `map_init` initializer cannot return a `Result`.
    //
    // Layout: `(walk cache, line-count cache)`; `None` until the first commit
    // that actually needs diffing.
    caches: Option<(gix::diff::blob::Platform, gix::diff::blob::Platform)>,
}

impl Worker {
    fn new(mut repo: gix::Repository) -> Self {
        // Per-thread object cache so repeated tree/blob reads during diffing hit
        // memory instead of the pack. 8 MiB comfortably holds a commit's trees.
        repo.object_cache_size_if_unset(8 * 1024 * 1024);
        Self { repo, caches: None }
    }

    fn numstat(&mut self, commit: &WalkedCommit) -> Result<DiffStat> {
        if commit.is_merge {
            return Ok(DiffStat::default());
        }
        let id = match &commit.handle {
            Handle::Real(id) => *id,
            // A real backend never yields a null handle; nothing to diff.
            Handle::Null(_) => return Ok(DiffStat::default()),
        };
        if self.caches.is_none() {
            let walk = self
                .repo
                .diff_resource_cache_for_tree_diff()
                .map_err(|e| Error::DiffStats(Box::new(e)))?;
            let count = self
                .repo
                .diff_resource_cache_for_tree_diff()
                .map_err(|e| Error::DiffStats(Box::new(e)))?;
            self.caches = Some((walk, count));
        }
        let Self { repo, caches } = self;
        let (walk_cache, count_cache) = caches.as_mut().expect("initialized above");
        numstat_real(repo, walk_cache, count_cache, id)
    }
}

/// Numstat of a canned (null-backend) commit: the stored value for a regular
/// commit, and an empty `DiffStat` for a merge. Real handles are not expected
/// here; should one show up it also yields an empty `DiffStat`.
fn null_diffstat(commit: &WalkedCommit) -> DiffStat {
    match &commit.handle {
        Handle::Null(stored) if !commit.is_merge => *stored,
        Handle::Null(_) | Handle::Real(_) => DiffStat::default(),
    }
}

fn walk_real(repo: &gix::Repository, range: &str, need_trailers: bool) -> Result<Vec<WalkedCommit>> {
    let (tips, hidden) = resolve_range(repo, range)?;
    let mailmap = repo.open_mailmap();
    let walk = repo
        .rev_walk(tips)
        .with_hidden(hidden)
        .all()
        .map_err(|e| Error::WalkRange {
            range: range.to_string(),
            source: Box::new(e),
        })?;
    let mut out = Vec::new();
    for info in walk {
        let info = info.map_err(|e| Error::ReadCommit(Box::new(e)))?;
        let commit = repo
            .find_commit(info.id)
            .map_err(|e| Error::ReadCommit(Box::new(e)))?;
        let is_merge = commit.parent_ids().take(2).count() > 1;
        out.push(WalkedCommit {
            meta: commit_meta(&commit, &mailmap, need_trailers)?,
            is_merge,
            handle: Handle::Real(info.id),
        });
    }
    Ok(out)
}

fn commit_meta(
    commit: &gix::Commit,
    mailmap: &gix::mailmap::Snapshot,
    need_trailers: bool,
) -> Result<CommitMeta> {
    let author = mailmap.resolve(
        commit
            .author()
            .map_err(|e| Error::ReadCommit(Box::new(e)))?,
    );
    let time_seconds = commit
        .committer()
        .map_err(|e| Error::ReadCommit(Box::new(e)))?
        .seconds();
    let trailers = if need_trailers {
        parse_trailers(commit)?
    } else {
        Vec::new()
    };
    Ok(CommitMeta {
        author: Author {
            name: author.name.to_string(),
            email: author.email.to_string(),
        },
        time_seconds,
        trailers,
    })
}

/// Collect the trailers found in the body of `commit`'s message as owned
/// token/value pairs. A message without a body has no trailers.
fn parse_trailers(commit: &gix::Commit) -> Result<Vec<Trailer>> {
    let message = commit
        .message()
        .map_err(|e| Error::ReadCommit(Box::new(e)))?;
    let trailers = match message.body() {
        None => Vec::new(),
        Some(body) => body
            .trailers()
            .map(|found| Trailer {
                token: found.token.to_string(),
                value: found.value.to_string(),
            })
            .collect(),
    };
    Ok(trailers)
}

/// Compute the numstat of commit `id` against its first parent, or against the
/// empty tree when the commit has no parent.
///
/// `walk_cache` drives the tree-level diff and `count_cache` the per-change
/// line diff; both are owned by the caller so they can be reused across
/// commits. Each cache's resources are cleared (allocations kept) before this
/// function returns successfully.
///
/// Only changes for which line counts could be obtained are counted: a change
/// whose diff or line count fails, or yields no counts, adds nothing to
/// `files`, `insertions` or `deletions`.
/// NOTE(review): presumably this is what excludes binary files — confirm
/// against gix's `line_counts` documentation.
///
/// # Errors
///
/// Returns an error if the commit, its parent, or either tree cannot be read,
/// or if the tree diff itself fails.
fn numstat_real(
    repo: &gix::Repository,
    walk_cache: &mut gix::diff::blob::Platform,
    count_cache: &mut gix::diff::blob::Platform,
    id: gix::ObjectId,
) -> Result<DiffStat> {
    let commit = repo
        .find_commit(id)
        .map_err(|e| Error::ReadCommit(Box::new(e)))?;
    let new_tree = commit.tree().map_err(|e| Error::ReadCommit(Box::new(e)))?;
    // Diff base: the first parent's tree, or the empty tree for a root commit.
    let old_tree = match commit.parent_ids().next() {
        Some(parent) => repo
            .find_commit(parent.detach())
            .map_err(|e| Error::ReadCommit(Box::new(e)))?
            .tree()
            .map_err(|e| Error::ReadCommit(Box::new(e)))?,
        None => repo.empty_tree(),
    };

    // Equivalent to `gix::Tree::stats`, but driving caller-owned caches so the
    // index and attribute stack are read once per worker rather than per commit.
    let (mut files, mut insertions, mut deletions) = (0u64, 0u64, 0u64);
    old_tree
        .changes()
        .map_err(|e| Error::DiffStats(Box::new(e)))?
        .for_each_to_obtain_tree_with_cache(&new_tree, walk_cache, |change| {
            // Per-change failures are deliberately dropped with `.ok()`: such a
            // change is skipped rather than failing the whole commit.
            if let Some(counts) = change
                .diff(count_cache)
                .ok()
                .and_then(|mut platform| platform.line_counts().ok())
                .flatten()
            {
                files += 1;
                insertions += u64::from(counts.insertions);
                deletions += u64::from(counts.removals);
            }
            // The resource cache only grows; clear it between changes to bound memory.
            count_cache.clear_resource_cache_keep_allocation();
            // The callback never fails and never stops the traversal early.
            Ok::<_, std::convert::Infallible>(std::ops::ControlFlow::Continue(()))
        })
        .map_err(|e| Error::DiffStats(Box::new(e)))?;
    // Same bound for the walk cache, once per commit. Not reached when the
    // traversal above returned an error.
    walk_cache.clear_resource_cache_keep_allocation();

    Ok(DiffStat {
        insertions,
        deletions,
        files,
    })
}

/// Tips to walk from, and commits to hide, for a revision range.
///
/// The first vector holds the tips handed to `rev_walk`, the second the ids
/// handed to `with_hidden`.
type RangeEnds = (Vec<gix::ObjectId>, Vec<gix::ObjectId>);

/// Turn a revision range into the tips to walk from and the commits to hide.
///
/// Only the `..` and `...` operators are interpreted here; each endpoint may be
/// any single-revision spelling git understands (refs, short hashes, `@{n}`,
/// ...), and an omitted endpoint means `HEAD`. Other gitrevisions(7) range
/// forms are not supported.
///
/// * `a...b` walks from both endpoints and hides their merge base, if any.
/// * `a..b` walks from `b` and hides `a`.
/// * anything else is resolved as a single revision with nothing hidden.
fn resolve_range(repo: &gix::Repository, range: &str) -> Result<RangeEnds> {
    let endpoint = |spelling: &str| single(repo, default_head(spelling));

    // The three-dot form takes precedence: every `a...b` also contains `..`.
    match (range.split_once("..."), range.split_once("..")) {
        (Some((left, right)), _) => {
            let left = endpoint(left)?;
            let right = endpoint(right)?;
            Ok((vec![left, right], merge_bases(repo, left, right)))
        }
        (None, Some((from, to))) => {
            let excluded = endpoint(from)?;
            let included = endpoint(to)?;
            Ok((vec![included], vec![excluded]))
        }
        (None, None) => {
            let tip = single(repo, range)?;
            Ok((vec![tip], Vec::new()))
        }
    }
}

/// Substitute `HEAD` for an omitted (empty) range endpoint; any other spelling
/// is passed through untouched.
fn default_head(rev: &str) -> &str {
    match rev {
        "" => "HEAD",
        spelled => spelled,
    }
}

fn single(repo: &gix::Repository, rev: &str) -> Result<gix::ObjectId> {
    Ok(repo
        .rev_parse_single(rev)
        .map_err(|e| Error::ResolveRevision {
            revision: rev.to_string(),
            source: Box::new(e),
        })?
        .detach())
}

/// Commits to hide for the symmetric difference `a...b`: the merge base of the
/// two tips, or nothing when none is reported.
///
/// NOTE(review): every error from `merge_base` is treated as "no merge base",
/// not only the disjoint-history case — confirm that is intended.
/// NOTE(review): at most one base is hidden; confirm the behavior for
/// histories where the tips have several merge bases.
fn merge_bases(repo: &gix::Repository, a: gix::ObjectId, b: gix::ObjectId) -> Vec<gix::ObjectId> {
    // Without a merge base (e.g. disjoint histories) nothing is hidden, so the
    // symmetric difference is simply everything reachable from either tip.
    repo.merge_base(a, b)
        .map(|base| vec![base.detach()])
        .unwrap_or_default()
}

/// Parse a `--since`/`--until` style date into seconds since the Unix epoch.
/// Accepts ISO 8601, RFC 2822, unix timestamps, and relative spellings like
/// "2 weeks ago" (a documented subset of git's approxidate).
///
/// # Errors
///
/// Returns an error if the input is not a recognized date format.
pub fn parse_date(input: Option<&str>) -> Result<Option<i64>> {
    let Some(s) = input else { return Ok(None) };
    let now = std::time::SystemTime::now();
    let time = gix::date::parse(s, Some(now)).map_err(|e| Error::InvalidDate {
        input: s.to_string(),
        message: e.to_string(),
    })?;
    Ok(Some(time.seconds))
}

#[cfg(test)]
mod tests {
    use super::parse_date;

    /// No input means no bound: `None` is passed through unchanged.
    #[test]
    fn parse_date_returns_none_for_no_input() {
        let parsed = parse_date(None).unwrap();
        assert!(parsed.is_none());
    }

    /// A plain ISO calendar date is accepted and yields a timestamp.
    #[test]
    fn parse_date_accepts_an_iso_date() {
        let parsed = parse_date(Some("2020-01-01")).unwrap();
        assert!(parsed.is_some());
    }

    /// Text that is not a date is reported as an error rather than ignored.
    #[test]
    fn parse_date_rejects_unparseable_input() {
        let outcome = parse_date(Some("not-a-real-date"));
        assert!(outcome.is_err());
    }
}