use anyhow::{Context, Result};
use git2::{DiffOptions, Repository, Sort};
use std::collections::HashSet;
use std::path::{Path, PathBuf};
/// Owned snapshot of a single commit, as produced by `GitWalker::iter_commits`.
///
/// Every field is copied out of the underlying `git2::Commit`, so the value
/// does not borrow from the repository.
#[derive(Debug, Clone)]
pub struct CommitInfo {
/// Commit object id rendered through its `Display` implementation.
pub hash: String,
/// Commit message; empty when `git2` cannot return it as a string.
pub message: String,
/// Author name; `"Unknown"` when `git2` cannot return it as a string.
pub author_name: String,
/// Author e-mail; empty when `git2` cannot return it as a string.
pub author_email: String,
/// Value of `commit.time().seconds()` for this commit.
pub commit_date: i64,
/// New-side path of every delta in the diff against the first parent
/// (or against an empty tree for a commit without parents).
pub files_changed: Vec<String>,
/// Patch text of that same diff, truncated to roughly 8000 bytes with a
/// trailing marker when it is longer.
pub diff_content: String,
/// Object ids of all parents, in the order reported by `commit.parents()`.
pub parent_hashes: Vec<String>,
}
/// Read-only helper that walks the history of a git repository and extracts
/// per-commit metadata and diffs.
pub struct GitWalker {
/// Handle to the repository found by `GitWalker::discover`.
repo: Repository,
/// Working directory of `repo`, captured once at construction time.
repo_path: PathBuf,
}
impl GitWalker {
/// Opens the git repository that `path` belongs to.
///
/// The lookup is delegated to [`Repository::discover`]; the repository's
/// working directory is remembered and later exposed by `repo_path`.
///
/// # Errors
///
/// Fails when no repository can be discovered from `path`, or when the
/// repository has no working directory (i.e. it is bare).
pub fn discover<P: AsRef<Path>>(path: P) -> Result<Self> {
    let repo = Repository::discover(path.as_ref())
        .context("Failed to discover git repository")?;

    // Copy the working directory out so the borrow on `repo` ends before
    // `repo` is moved into the returned value.
    let repo_path = repo
        .workdir()
        .map(Path::to_path_buf)
        .context("Repository has no working directory (bare repository)")?;

    tracing::info!("Opened git repository at: {}", repo_path.display());
    Ok(Self { repo, repo_path })
}
/// Returns the working directory of the repository this walker was opened on.
pub fn repo_path(&self) -> &Path {
    self.repo_path.as_path()
}
/// Returns the shorthand name of the reference `HEAD` points at.
///
/// Yields `None` when `HEAD` cannot be resolved or when its shorthand is not
/// available as a string.
pub fn current_branch(&self) -> Option<String> {
    let head = self.repo.head().ok()?;
    let name = head.shorthand()?;
    Some(name.to_owned())
}
/// Walks history and returns the commits that pass all filters.
///
/// Commits are visited in `Sort::TIME | Sort::TOPOLOGICAL` order, starting
/// from the tip of the local branch `branch`, or from `HEAD` when `branch`
/// is `None`.
///
/// # Parameters
///
/// * `branch` – name of a *local* branch to start from; `None` means `HEAD`.
/// * `max_count` – upper bound on the number of commits **returned**;
///   commits that are skipped or filtered out do not count towards it.
/// * `since_date` – lower bound on `commit.time().seconds()`. The walk
///   stops entirely at the first non-skipped commit older than this value.
/// * `until_date` – upper bound on `commit.time().seconds()`; newer commits
///   are passed over and the walk continues.
/// * `skip_hashes` – full hex object ids of commits to leave out (for
///   example ones that were already indexed).
///
/// # Errors
///
/// Fails when the branch cannot be found or has no direct target, when the
/// revision walk reports an error, or when a commit or its diff cannot be
/// read.
pub fn iter_commits(
    &self,
    branch: Option<&str>,
    max_count: Option<usize>,
    since_date: Option<i64>,
    until_date: Option<i64>,
    skip_hashes: &HashSet<String>,
) -> Result<Vec<CommitInfo>> {
    let mut revwalk = self.repo.revwalk()?;
    revwalk.set_sorting(Sort::TIME | Sort::TOPOLOGICAL)?;

    match branch {
        Some(branch_name) => {
            let reference = self
                .repo
                .find_branch(branch_name, git2::BranchType::Local)
                .with_context(|| format!("Failed to find branch '{branch_name}'"))?;
            let oid = reference
                .get()
                .target()
                .with_context(|| format!("Branch '{branch_name}' has no target"))?;
            revwalk.push(oid)?;
        }
        None => revwalk.push_head()?,
    }

    let max = max_count.unwrap_or(usize::MAX);
    let mut commits = Vec::new();

    for oid in revwalk {
        if commits.len() >= max {
            break;
        }
        let oid = oid?;

        // Check the skip set against the raw id first so that skipped
        // commits are never loaded from the object database.
        let commit_hash = oid.to_string();
        if skip_hashes.contains(&commit_hash) {
            tracing::debug!("Skipping already indexed commit: {}", commit_hash);
            continue;
        }

        let commit = self.repo.find_commit(oid)?;
        let commit_time = commit.time().seconds();

        // NOTE(review): stopping here assumes commit times do not increase
        // again further along the walk; a commit with a skewed clock could
        // end the walk early. Confirm this trade-off is acceptable.
        if since_date.is_some_and(|since| commit_time < since) {
            break;
        }
        if until_date.is_some_and(|until| commit_time > until) {
            continue;
        }

        commits.push(self.extract_commit_info(&commit)?);
        if commits.len() % 50 == 0 {
            tracing::debug!("Processed {} commits", commits.len());
        }
    }

    tracing::info!("Extracted {} new commits", commits.len());
    Ok(commits)
}
/// Builds an owned [`CommitInfo`] from a `git2` commit.
///
/// Missing or non-string metadata falls back to fixed defaults: an empty
/// message, `"Unknown"` for the author name and an empty author e-mail.
///
/// # Errors
///
/// Propagates any failure from `extract_diff`.
fn extract_commit_info(&self, commit: &git2::Commit) -> Result<CommitInfo> {
    let signature = commit.author();
    let (files_changed, diff_content) = self.extract_diff(commit)?;

    Ok(CommitInfo {
        hash: commit.id().to_string(),
        message: commit.message().unwrap_or("").to_owned(),
        author_name: signature.name().unwrap_or("Unknown").to_owned(),
        author_email: signature.email().unwrap_or("").to_owned(),
        commit_date: commit.time().seconds(),
        files_changed,
        diff_content,
        parent_hashes: commit
            .parents()
            .map(|parent| parent.id().to_string())
            .collect(),
    })
}
/// Computes the changed paths and a size-capped patch text for `commit`.
///
/// The commit's tree is diffed against the tree of its *first* parent only;
/// a commit without parents is diffed against an empty tree.
///
/// Returns `(files_changed, diff_content)`:
///
/// * `files_changed` – the new-side path of every delta that has one.
/// * `diff_content` – the patch text built from context/added/removed
///   lines, file headers (prefixed with `"--- "`) and hunk headers. Binary
///   lines and lines that are not valid UTF-8 are left out. Text beyond
///   `MAX_DIFF_BYTES` is cut at a character boundary and replaced by a
///   truncation marker.
///
/// # Errors
///
/// Fails when a tree, the parent commit, or the diff itself cannot be read.
fn extract_diff(&self, commit: &git2::Commit) -> Result<(Vec<String>, String)> {
    /// Maximum number of patch-text bytes kept per commit.
    const MAX_DIFF_BYTES: usize = 8000;
    /// Suffix appended when the patch text had to be cut.
    const TRUNCATION_MARKER: &str = "\n\n[... diff truncated ...]";

    let tree = commit.tree()?;
    let parent_tree = if commit.parent_count() > 0 {
        Some(commit.parent(0)?.tree()?)
    } else {
        None
    };

    let mut diff_opts = DiffOptions::new();
    diff_opts
        .context_lines(3)
        .interhunk_lines(0)
        .ignore_whitespace(false);

    // `None` as the old tree makes every file in `tree` show up as added.
    let diff = self.repo.diff_tree_to_tree(
        parent_tree.as_ref(),
        Some(&tree),
        Some(&mut diff_opts),
    )?;

    let files_changed: Vec<String> = diff
        .deltas()
        .filter_map(|delta| {
            delta
                .new_file()
                .path()
                .map(|path| path.display().to_string())
        })
        .collect();

    let mut diff_content = String::new();
    diff.print(git2::DiffFormat::Patch, |_delta, _hunk, line| {
        // Once the buffer is past the cap nothing more can survive the
        // truncation below, so stop collecting. The callback still returns
        // `true`: per the git2 documentation, returning `false` makes
        // `print` itself return an error.
        if diff_content.len() > MAX_DIFF_BYTES {
            return true;
        }

        let origin = line.origin();
        if origin == 'B' {
            // Binary payload lines are never included.
            return true;
        }

        let Ok(content) = std::str::from_utf8(line.content()) else {
            tracing::debug!("Skipping diff line with invalid UTF-8");
            return true;
        };

        match origin {
            '+' | '-' | ' ' => {
                diff_content.push(origin);
                diff_content.push_str(content);
            }
            'F' => {
                diff_content.push_str("--- ");
                diff_content.push_str(content);
            }
            'H' => diff_content.push_str(content),
            _ => {}
        }
        true
    })?;

    if diff_content.len() > MAX_DIFF_BYTES {
        // Back up to a character boundary so `truncate` cannot panic.
        let mut truncate_at = MAX_DIFF_BYTES;
        while !diff_content.is_char_boundary(truncate_at) {
            truncate_at -= 1;
        }
        diff_content.truncate(truncate_at);
        diff_content.push_str(TRUNCATION_MARKER);
        tracing::warn!("Truncated large diff for commit {}", commit.id());
    }

    Ok((files_changed, diff_content))
}
/// Reports whether `HEAD` can currently be resolved in this repository.
pub fn has_commits(&self) -> bool {
    let head = self.repo.head();
    head.is_ok()
}
/// Counts the items produced by a revision walk that starts at `HEAD`.
///
/// Every item the walk yields is counted, whether it is `Ok` or `Err`.
/// The whole walk is consumed to produce the number.
///
/// # Errors
///
/// Fails when the walk cannot be created or `HEAD` cannot be pushed onto it.
pub fn estimate_commit_count(&self) -> Result<usize> {
    let mut walk = self.repo.revwalk()?;
    walk.push_head()?;
    let total = walk.count();
    Ok(total)
}
}
#[cfg(test)]
mod tests {
use super::*;
// Discovery from the current directory must yield an existing working
// directory and a resolvable HEAD.
#[test]
fn test_discover_current_repo() {
    let walker = GitWalker::discover(".").expect("Should find git repo");
    let root = walker.repo_path();
    assert!(root.exists());
    assert!(walker.has_commits());
}
// The repository the tests run in is expected to have a named HEAD.
#[test]
fn test_current_branch() {
    let branch = GitWalker::discover(".")
        .expect("Should find git repo")
        .current_branch();
    assert!(branch.is_some(), "Should have a current branch");
}
// `max_count` caps the result size, and every returned commit carries a
// hash and an author name.
#[test]
fn test_iter_commits_limited() {
    let walker = GitWalker::discover(".").expect("Should find git repo");
    let commits = walker
        .iter_commits(None, Some(5), None, None, &HashSet::new())
        .expect("Should iterate commits");

    assert!(commits.len() <= 5, "Should respect max_count");
    for CommitInfo {
        hash, author_name, ..
    } in &commits
    {
        assert!(!hash.is_empty(), "Commit hash should not be empty");
        assert!(!author_name.is_empty(), "Author name should not be empty");
    }
}
// Sanity-checks the shape of the first extracted commit, if there is one.
#[test]
fn test_commit_info_structure() {
    let walker = GitWalker::discover(".").expect("Should find git repo");
    let commits = walker
        .iter_commits(None, Some(1), None, None, &HashSet::new())
        .expect("Should get commits");

    // Nothing to verify when no commit was returned.
    let Some(commit) = commits.first() else {
        return;
    };
    assert_eq!(commit.hash.len(), 40, "Git SHA should be 40 chars");
    assert!(commit.commit_date > 0, "Commit date should be positive");
}
// A commit listed in `skip_hashes` must not be returned by a second walk.
#[test]
fn test_skip_hashes() {
    let walker = GitWalker::discover(".").expect("Should find git repo");
    let initial = walker
        .iter_commits(None, Some(1), None, None, &HashSet::new())
        .expect("Should get commits");

    // Nothing to verify when no commit was returned.
    let Some(first_commit) = initial.first() else {
        return;
    };

    let skip_set = HashSet::from([first_commit.hash.clone()]);
    let following = walker
        .iter_commits(None, Some(1), None, None, &skip_set)
        .expect("Should get commits");

    if let Some(second_commit) = following.first() {
        assert_ne!(
            first_commit.hash, second_commit.hash,
            "Should skip specified commit"
        );
    }
}
}