use std::path::Path;
use std::process::Command;
use super::TrainDataError;
/// Metadata for a single commit, stored as the raw strings emitted by git.
///
/// Values are not parsed or validated by this type. Equality and hashing are
/// field-wise, which lets callers compare or deduplicate commits directly.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CommitInfo {
    /// Commit hash (populated from `%H` by `git_log`).
    pub sha: String,
    /// Commit subject line (populated from `%s` by `git_log`).
    pub message: String,
    /// Author date string (populated from `%aI` by `git_log`).
    pub date: String,
}
/// Runs `git log --no-merges` in `repo` and parses the result into [`CommitInfo`] records.
///
/// Each output line is expected to hold three NUL-separated fields (hash, subject,
/// author date). Blank lines are ignored; lines with fewer than three fields are
/// logged at `warn` level and skipped.
///
/// `max_commits` is passed to git as `-n <max_commits>`; a value of `0` omits the
/// flag so no limit is applied.
///
/// # Errors
///
/// * Propagates the spawn failure (converted from `std::io::Error`) if `git` cannot be run.
/// * Returns `TrainDataError::Git` carrying git's trimmed stderr if the command exits
///   unsuccessfully.
pub fn git_log(repo: &Path, max_commits: usize) -> Result<Vec<CommitInfo>, TrainDataError> {
    let _span = tracing::info_span!("git_log", repo = %repo.display(), max_commits).entered();

    let mut git = Command::new("git");
    git.arg("-C")
        .arg(repo)
        .arg("log")
        .arg("--format=%H%x00%s%x00%aI")
        .arg("--no-merges");
    if max_commits != 0 {
        git.arg("-n").arg(max_commits.to_string());
    }

    let output = match git.output() {
        Ok(output) => output,
        Err(e) => {
            tracing::warn!(error = %e, "Failed to spawn git log");
            return Err(TrainDataError::from(e));
        }
    };

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(TrainDataError::Git(format!(
            "git log failed: {}",
            stderr.trim()
        )));
    }

    let text = String::from_utf8_lossy(&output.stdout);
    let mut commits = Vec::new();
    for line in text.lines().map(str::trim).filter(|l| !l.is_empty()) {
        // At most three pieces are produced, so any extra NUL stays inside the date field.
        let mut fields = line.splitn(3, '\0');
        match (fields.next(), fields.next(), fields.next()) {
            (Some(sha), Some(message), Some(date)) => commits.push(CommitInfo {
                sha: sha.to_owned(),
                message: message.to_owned(),
                date: date.to_owned(),
            }),
            _ => tracing::warn!(
                line,
                "Skipping malformed git log line (expected 3 NUL-separated fields)"
            ),
        }
    }

    let count = commits.len();
    tracing::debug!(count, "Parsed git log commits");
    if count > 100_000 {
        tracing::warn!(
            count,
            "git_log returned >100K commits — consider setting max_commits to limit memory usage"
        );
    }

    Ok(commits)
}
/// Returns the patch text produced by `git diff-tree --root --no-commit-id -r -p <sha>`
/// run inside `repo`.
///
/// The output is decoded lossily, so invalid UTF-8 bytes are replaced rather than
/// causing an error.
///
/// # Errors
///
/// * `TrainDataError::Git` if `sha` starts with `-` or contains a NUL byte.
/// * The converted spawn failure if `git` cannot be run.
/// * `TrainDataError::Git` carrying git's trimmed stderr if the command exits
///   unsuccessfully.
pub fn git_diff_tree(repo: &Path, sha: &str) -> Result<String, TrainDataError> {
    let _span = tracing::info_span!("git_diff_tree", repo = %repo.display(), sha).entered();

    // Presumably guards against the value being read by git as a command-line option.
    let sha_is_acceptable = !sha.starts_with('-') && !sha.contains('\0');
    if !sha_is_acceptable {
        return Err(TrainDataError::Git(format!(
            "Invalid SHA '{}': must not start with '-' or contain null bytes",
            sha
        )));
    }

    let mut git = Command::new("git");
    git.arg("-C")
        .arg(repo)
        .args(["diff-tree", "--root", "--no-commit-id", "-r", "-p"])
        .arg(sha);

    let output = match git.output() {
        Ok(output) => output,
        Err(e) => {
            tracing::warn!(error = %e, "Failed to spawn git diff-tree");
            return Err(TrainDataError::from(e));
        }
    };

    if output.status.success() {
        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
    } else {
        let stderr = String::from_utf8_lossy(&output.stderr);
        Err(TrainDataError::Git(format!(
            "git diff-tree failed for {}: {}",
            sha,
            stderr.trim()
        )))
    }
}
/// Largest `git show` output, in bytes (50 MiB), that `git_show` will return;
/// anything bigger is skipped and reported as `Ok(None)`.
const MAX_SHOW_SIZE: usize = 50 * 1024 * 1024;
/// Fetches the contents of `path` as of revision `sha` by running `git show <sha>:<path>`
/// inside `repo`.
///
/// Returns `Ok(Some(text))` when the output is valid UTF-8 and no larger than
/// `MAX_SHOW_SIZE` bytes. Returns `Ok(None)` (with a `debug` log) when the output is
/// oversized or not valid UTF-8.
///
/// # Errors
///
/// * `TrainDataError::Git` if `sha` or `path` starts with `-` or contains a NUL byte,
///   or if `path` contains `:`.
/// * The converted spawn failure if `git` cannot be run.
/// * `TrainDataError::Git` carrying git's trimmed stderr if the command exits
///   unsuccessfully.
pub fn git_show(repo: &Path, sha: &str, path: &str) -> Result<Option<String>, TrainDataError> {
    let _span = tracing::info_span!("git_show", repo = %repo.display(), sha, path).entered();

    let sha_is_acceptable = !sha.starts_with('-') && !sha.contains('\0');
    if !sha_is_acceptable {
        return Err(TrainDataError::Git(format!(
            "Invalid SHA '{}': must not start with '-' or contain null bytes",
            sha
        )));
    }

    let path_is_acceptable = !path.starts_with('-') && !path.contains('\0');
    if !path_is_acceptable {
        return Err(TrainDataError::Git(format!(
            "Invalid path '{}': must not start with '-' or contain null bytes",
            path
        )));
    }

    // The `rev:path` spec built below uses ':' as its separator.
    if path.contains(':') {
        return Err(TrainDataError::Git(format!(
            "Invalid path '{}': must not contain ':' (reserved for git rev:path syntax)",
            path
        )));
    }

    let spec = format!("{}:{}", sha, path);
    let mut git = Command::new("git");
    git.arg("-C").arg(repo).arg("show").arg(&spec);

    let output = match git.output() {
        Ok(output) => output,
        Err(e) => {
            tracing::warn!(error = %e, "Failed to spawn git show");
            return Err(TrainDataError::from(e));
        }
    };

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(TrainDataError::Git(format!(
            "git show failed for {}: {}",
            spec,
            stderr.trim()
        )));
    }

    let bytes = output.stdout;
    let size = bytes.len();
    if size > MAX_SHOW_SIZE {
        tracing::debug!(size, max = MAX_SHOW_SIZE, "Skipping oversized file");
        return Ok(None);
    }

    // Strict decoding: content that is not valid UTF-8 is skipped rather than repaired.
    let content = String::from_utf8(bytes).ok();
    if content.is_none() {
        tracing::debug!(path, "Skipping non-UTF-8 file");
    }
    Ok(content)
}
/// Reports whether `git rev-parse --is-shallow-repository`, run inside `repo`,
/// prints `true`.
///
/// Returns `false` when `git` cannot be spawned (logged at `warn` level) and whenever
/// the trimmed stdout is anything other than `true`; the exit status is not inspected.
pub fn is_shallow(repo: &Path) -> bool {
    let _span = tracing::info_span!("is_shallow", repo = %repo.display()).entered();

    let mut git = Command::new("git");
    git.arg("-C")
        .arg(repo)
        .arg("rev-parse")
        .arg("--is-shallow-repository");

    match git.output() {
        Ok(output) => String::from_utf8_lossy(&output.stdout).trim() == "true",
        Err(e) => {
            tracing::warn!(error = %e, "Failed to check shallow status");
            false
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Runs `git -C <repo> <args>` and panics if it cannot be spawned or exits
    /// unsuccessfully, so a broken fixture fails at the setup step that caused it
    /// instead of surfacing later as an unrelated assertion failure.
    fn run_git(repo: &Path, args: &[&str]) {
        let output = Command::new("git")
            .arg("-C")
            .arg(repo)
            .args(args)
            .output()
            .unwrap_or_else(|e| panic!("failed to spawn git {:?}: {}", args, e));
        assert!(
            output.status.success(),
            "git {:?} failed: {}",
            args,
            String::from_utf8_lossy(&output.stderr).trim()
        );
    }

    /// Stages everything in `repo` and commits it with `message`.
    fn commit_all(repo: &Path, message: &str) {
        run_git(repo, &["add", "."]);
        run_git(repo, &["commit", "-m", message]);
    }

    /// Creates a temporary repository containing a single commit that adds `test.rs`.
    fn create_test_repo() -> TempDir {
        let dir = TempDir::new().unwrap();
        let repo = dir.path();
        run_git(repo, &["init"]);
        run_git(repo, &["config", "user.email", "test@test.com"]);
        run_git(repo, &["config", "user.name", "Test"]);
        std::fs::write(repo.join("test.rs"), "fn hello() { println!(\"hi\"); }\n").unwrap();
        commit_all(repo, "initial commit");
        dir
    }

    /// Creates the single-commit repository and adds a second commit modifying `test.rs`.
    fn create_test_repo_with_change() -> TempDir {
        let dir = create_test_repo();
        let repo = dir.path();
        std::fs::write(
            repo.join("test.rs"),
            "fn hello() { println!(\"hello world\"); }\nfn goodbye() { }\n",
        )
        .unwrap();
        commit_all(repo, "update hello and add goodbye");
        dir
    }

    #[test]
    fn git_log_on_test_repo() {
        let dir = create_test_repo();
        let commits = git_log(dir.path(), 0).unwrap();
        assert!(!commits.is_empty());
        assert!(!commits[0].sha.is_empty());
        assert!(!commits[0].message.is_empty());
        assert!(!commits[0].date.is_empty());
    }

    #[test]
    fn git_log_respects_max_commits() {
        let dir = create_test_repo_with_change();
        let all = git_log(dir.path(), 0).unwrap();
        assert_eq!(all.len(), 2);
        let limited = git_log(dir.path(), 1).unwrap();
        assert_eq!(limited.len(), 1);
        assert_eq!(limited[0].sha, all[0].sha);
    }

    #[test]
    fn git_log_returns_iso_date() {
        let dir = create_test_repo();
        let commits = git_log(dir.path(), 0).unwrap();
        assert!(
            commits[0].date.contains('T') || commits[0].date.contains('-'),
            "Expected ISO date, got: {}",
            commits[0].date
        );
    }

    #[test]
    fn git_diff_tree_on_test_repo() {
        let dir = create_test_repo_with_change();
        let commits = git_log(dir.path(), 0).unwrap();
        let diff = git_diff_tree(dir.path(), &commits[0].sha).unwrap();
        assert!(diff.contains("test.rs"), "diff should reference test.rs");
        assert!(diff.contains("@@"), "diff should contain hunk headers");
    }

    #[test]
    fn git_diff_tree_initial_commit() {
        let dir = create_test_repo();
        let commits = git_log(dir.path(), 0).unwrap();
        let diff = git_diff_tree(dir.path(), &commits[0].sha).unwrap();
        assert!(
            diff.contains("test.rs"),
            "initial commit diff should reference test.rs"
        );
    }

    #[test]
    fn git_show_returns_content() {
        let dir = create_test_repo();
        let commits = git_log(dir.path(), 0).unwrap();
        let content = git_show(dir.path(), &commits[0].sha, "test.rs").unwrap();
        assert!(content.is_some());
        assert!(content.unwrap().contains("fn hello"));
    }

    #[test]
    fn git_show_nonexistent_file_errors() {
        let dir = create_test_repo();
        let commits = git_log(dir.path(), 0).unwrap();
        let result = git_show(dir.path(), &commits[0].sha, "nonexistent.rs");
        assert!(result.is_err(), "Should error for nonexistent file");
    }

    #[test]
    fn is_shallow_on_normal_repo() {
        let dir = create_test_repo();
        assert!(!is_shallow(dir.path()));
    }

    #[test]
    fn is_shallow_on_nonexistent_path() {
        assert!(!is_shallow(Path::new("/nonexistent/repo/path")));
    }
}