use std::collections::HashSet;
use std::path::{Path, PathBuf};
use gix::bstr::ByteSlice;
/// Options controlling a [`GitHistoryScanner`] run.
#[derive(Debug, Clone)]
pub struct GitHistoryConfig {
    /// Maximum number of commits to scan; `0` means unlimited.
    pub max_commits: usize,
    /// Only include commits at or after this date (`YYYY-MM-DD`).
    pub since: Option<String>,
    /// Only include commits at or before this date (`YYYY-MM-DD`).
    pub until: Option<String>,
    /// Branch to walk (resolved as `refs/heads/<name>`); HEAD when `None`.
    pub branch: Option<String>,
    /// File extensions to include; an empty list includes every file.
    pub extensions: Vec<String>,
}
impl Default for GitHistoryConfig {
fn default() -> Self {
Self {
max_commits: 0,
since: None,
until: None,
branch: None,
extensions: vec!["rs".into()],
}
}
}
/// A file's content as captured at a specific commit.
#[derive(Debug, Clone)]
pub struct HistoricalFile {
    /// Repository-relative path of the file.
    pub path: PathBuf,
    /// UTF-8 content of the blob at this commit (non-UTF-8 blobs are skipped).
    pub content: String,
    /// Hex id of the commit this snapshot was taken from.
    pub commit_id: String,
    /// First line (title) of the commit message.
    pub commit_summary: String,
    /// Commit author name; `"Unknown"` when it cannot be read.
    pub author: String,
    /// Commit time in seconds since the Unix epoch; `0` when unreadable.
    pub timestamp: i64,
    /// `true` when the path no longer exists in the tip commit's tree.
    pub was_deleted: bool,
}
/// Outcome of a history scan.
#[derive(Debug)]
pub struct GitHistoryResult {
    /// Collected file snapshots.
    pub files: Vec<HistoricalFile>,
    /// Number of commits that passed the date filters and were examined.
    pub commits_scanned: usize,
    /// Non-fatal per-commit errors encountered during the walk.
    pub errors: Vec<String>,
}
/// Walks a git repository's commit history and extracts file snapshots.
pub struct GitHistoryScanner {
    // Filters applied during `scan`.
    config: GitHistoryConfig,
}
impl GitHistoryScanner {
/// Creates a scanner with the given configuration.
pub fn new(config: GitHistoryConfig) -> Self {
    Self { config }
}
/// Walks the commit history of the repository at `repo_path`, collecting
/// file snapshots according to the configured branch, date, and extension
/// filters.
///
/// Per-commit failures (unreadable commit or tree) are recorded in
/// [`GitHistoryResult::errors`] and do not abort the scan.
///
/// # Errors
/// Fails when the repository cannot be opened, the configured branch or
/// HEAD cannot be resolved, a date filter is malformed, or the revision
/// walk cannot be started.
pub fn scan(&self, repo_path: &Path) -> Result<GitHistoryResult, GitHistoryError> {
    let repo = gix::open(repo_path).map_err(|e| GitHistoryError::RepoOpen(e.to_string()))?;
    let mut result = GitHistoryResult {
        files: Vec::new(),
        commits_scanned: 0,
        errors: Vec::new(),
    };
    // Tip to walk from: the configured branch, or HEAD when none is set.
    let head = if let Some(ref branch_name) = self.config.branch {
        repo.find_reference(&format!("refs/heads/{}", branch_name))
            .map_err(|e| GitHistoryError::BranchNotFound(branch_name.clone(), e.to_string()))?
            .id()
            .detach()
    } else {
        repo.head_id()
            .map_err(|e| GitHistoryError::HeadNotFound(e.to_string()))?
            .detach()
    };
    let since_timestamp = self.parse_date(&self.config.since)?;
    let until_timestamp = self.parse_date(&self.config.until)?;
    // Files present at the walk tip; anything encountered in history but
    // missing here is reported as deleted.
    // Fix: snapshot the *selected* tip commit rather than unconditionally
    // HEAD, so a configured branch is honored by the deletion check too.
    let mut current_files: HashSet<PathBuf> = HashSet::new();
    let mut seen_in_history: HashSet<PathBuf> = HashSet::new();
    if let Ok(commit) = repo.find_commit(head) {
        if let Ok(tree) = commit.tree() {
            self.collect_tree_files(&repo, &tree, PathBuf::new(), &mut current_files);
        }
    }
    let walk = repo
        .rev_walk([head])
        .all()
        .map_err(|e| GitHistoryError::WalkError(e.to_string()))?;
    for commit_result in walk {
        // `max_commits == 0` means unlimited.
        if self.config.max_commits > 0 && result.commits_scanned >= self.config.max_commits {
            break;
        }
        let commit_info = match commit_result {
            Ok(info) => info,
            Err(e) => {
                result.errors.push(format!("Failed to get commit: {}", e));
                continue;
            }
        };
        let commit = match repo.find_commit(commit_info.id) {
            Ok(c) => c,
            Err(e) => {
                result
                    .errors
                    .push(format!("Failed to find commit {}: {}", commit_info.id, e));
                continue;
            }
        };
        // Commits outside the [since, until] window are skipped entirely
        // and do not count toward `commits_scanned`.
        let commit_time = commit.time().map(|t| t.seconds).unwrap_or(0);
        if let Some(since) = since_timestamp {
            if commit_time < since {
                continue;
            }
        }
        if let Some(until) = until_timestamp {
            if commit_time > until {
                continue;
            }
        }
        result.commits_scanned += 1;
        let commit_id_str = commit_info.id.to_string();
        let commit_summary = commit
            .message()
            .ok()
            .and_then(|m| m.title.to_str().ok().map(|s| s.to_string()))
            .unwrap_or_default();
        let author = commit
            .author()
            .ok()
            .map(|a| a.name.to_str().unwrap_or("Unknown").to_string())
            .unwrap_or_else(|| "Unknown".into());
        let tree = match commit.tree() {
            Ok(t) => t,
            Err(e) => {
                result
                    .errors
                    .push(format!("Failed to get tree for {}: {}", commit_info.id, e));
                continue;
            }
        };
        // Fix: this argument was the mojibake `¤t_files` (an HTML-entity
        // corruption of `&current_files`), which did not compile.
        self.extract_files_from_tree(
            &repo,
            &tree,
            PathBuf::new(),
            &commit_id_str,
            &commit_summary,
            &author,
            commit_time,
            &current_files,
            &mut seen_in_history,
            &mut result.files,
        );
    }
    Ok(result)
}
/// Recursively records every blob path in `tree` that passes the extension
/// filter, prefixing entries with `prefix`.
fn collect_tree_files(
    &self,
    repo: &gix::Repository,
    tree: &gix::Tree,
    prefix: PathBuf,
    files: &mut HashSet<PathBuf>,
) {
    use gix::object::tree::EntryKind;
    // `flatten()` silently drops unreadable entries, matching the
    // skip-on-error behavior of the rest of the scanner.
    for entry in tree.iter().flatten() {
        // Entries with non-UTF-8 filenames are skipped.
        let Ok(name) = entry.filename().to_str() else {
            continue;
        };
        let path = prefix.join(name);
        match entry.mode().kind() {
            EntryKind::Blob => {
                if self.should_include_file(&path) {
                    files.insert(path);
                }
            }
            EntryKind::Tree => {
                let Ok(obj) = entry.object() else {
                    continue;
                };
                if let Ok(subtree) = obj.try_into_tree() {
                    self.collect_tree_files(repo, &subtree, path, files);
                }
            }
            _ => {}
        }
    }
}
/// Recursively walks `tree`, appending one [`HistoricalFile`] per qualifying
/// blob to `output`.
///
/// Dedup rules, driven by `current_files` (paths at the walk tip) and
/// `seen_in_history` (paths already emitted):
/// - a path still present at the tip is captured only the first time the
///   walk encounters it (the walk starts at the tip, so typically the
///   newest revision);
/// - a path absent from the tip (`was_deleted == true`) bypasses that
///   check, so every encountered revision of a deleted file is captured.
///
/// Blobs with non-UTF-8 names or contents are silently skipped.
#[allow(clippy::too_many_arguments)]
fn extract_files_from_tree(
    &self,
    repo: &gix::Repository,
    tree: &gix::Tree,
    prefix: PathBuf,
    commit_id: &str,
    commit_summary: &str,
    author: &str,
    timestamp: i64,
    current_files: &HashSet<PathBuf>,
    seen_in_history: &mut HashSet<PathBuf>,
    output: &mut Vec<HistoricalFile>,
) {
    for entry in tree.iter() {
        // Unreadable tree entries are skipped, not reported.
        let entry = match entry {
            Ok(e) => e,
            Err(_) => continue,
        };
        // Skip entries whose filename is not valid UTF-8.
        let name = match entry.filename().to_str() {
            Ok(n) => n,
            Err(_) => continue,
        };
        let path = prefix.join(name);
        match entry.mode().kind() {
            gix::object::tree::EntryKind::Blob => {
                if !self.should_include_file(&path) {
                    continue;
                }
                // A path missing from the tip tree was deleted at some point.
                let was_deleted = !current_files.contains(&path);
                // Live files: emit only the first revision the walk reaches.
                // Deleted files fall through and are emitted every time.
                if !was_deleted && seen_in_history.contains(&path) {
                    continue;
                }
                // NOTE(review): the path is marked seen even when the blob
                // below fails to load or is not UTF-8 — older readable
                // revisions of such a live file are then skipped as well.
                // Presumably intentional; confirm.
                seen_in_history.insert(path.clone());
                let blob_id = entry.id();
                let blob = match repo.find_blob(blob_id) {
                    Ok(b) => b,
                    Err(_) => continue,
                };
                // Binary (non-UTF-8) content is skipped.
                let content = match blob.data.to_str() {
                    Ok(s) => s.to_string(),
                    Err(_) => continue,
                };
                output.push(HistoricalFile {
                    path,
                    content,
                    commit_id: commit_id.to_string(),
                    commit_summary: commit_summary.to_string(),
                    author: author.to_string(),
                    timestamp,
                    was_deleted,
                });
            }
            gix::object::tree::EntryKind::Tree => {
                // Recurse into subdirectories, extending the path prefix.
                if let Ok(obj) = entry.object() {
                    if let Ok(subtree) = obj.try_into_tree() {
                        self.extract_files_from_tree(
                            repo,
                            &subtree,
                            path,
                            commit_id,
                            commit_summary,
                            author,
                            timestamp,
                            current_files,
                            seen_in_history,
                            output,
                        );
                    }
                }
            }
            _ => {}
        }
    }
}
/// Returns `true` when `path`'s extension is in the configured allow-list.
fn should_include_file(&self, path: &Path) -> bool {
    // An empty allow-list means "accept every file".
    if self.config.extensions.is_empty() {
        return true;
    }
    // Files without an extension (or with a non-UTF-8 one) never match.
    match path.extension().and_then(|e| e.to_str()) {
        Some(ext) => self.config.extensions.iter().any(|allowed| allowed == ext),
        None => false,
    }
}
/// Parses an optional `YYYY-MM-DD` date into a Unix timestamp (midnight of
/// that day), or `Ok(None)` when no date is set.
///
/// # Errors
/// Returns [`GitHistoryError::InvalidDate`] when the string is not three
/// `-`-separated numeric fields, or the month/day fall outside 1-12 / 1-31.
fn parse_date(&self, date: &Option<String>) -> Result<Option<i64>, GitHistoryError> {
    let Some(s) = date else {
        return Ok(None);
    };
    let invalid = || GitHistoryError::InvalidDate(s.clone());
    let parts: Vec<&str> = s.split('-').collect();
    if parts.len() != 3 {
        return Err(invalid());
    }
    let year: i32 = parts[0].parse().map_err(|_| invalid())?;
    let month: u32 = parts[1].parse().map_err(|_| invalid())?;
    let day: u32 = parts[2].parse().map_err(|_| invalid())?;
    // Fix: reject out-of-range fields. Previously e.g. month 14 was
    // accepted, making `days_since_epoch` index past its 12-entry month
    // table and panic; day 0 underflowed `day - 1`.
    if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
        return Err(invalid());
    }
    Ok(Some(days_since_epoch(year, month, day) * 86400))
}
}
/// Days elapsed from 1970-01-01 to the given civil date.
///
/// Performs no range validation: callers must supply a month in 1-12 and a
/// day >= 1 (the scanner validates dates before calling this).
fn days_since_epoch(year: i32, month: u32, day: u32) -> i64 {
    const MONTH_LENGTHS: [i64; 12] = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
    // Gregorian leap-year rule, inlined (same predicate as `is_leap_year`).
    let leap = |y: i32| (y % 4 == 0 && y % 100 != 0) || y % 400 == 0;
    // Whole years since the epoch.
    let year_days: i64 = (1970..year)
        .map(|y| if leap(y) { 366 } else { 365 })
        .sum();
    // Whole months elapsed within the target year, counting Feb 29.
    let month_days: i64 = (1..month)
        .map(|m| MONTH_LENGTHS[(m - 1) as usize] + i64::from(m == 2 && leap(year)))
        .sum();
    year_days + month_days + i64::from(day - 1)
}
/// Gregorian leap-year test: divisible by 4, except centuries, which must
/// be divisible by 400.
fn is_leap_year(year: i32) -> bool {
    if year % 100 == 0 {
        return year % 400 == 0;
    }
    year % 4 == 0
}
/// Errors returned by [`GitHistoryScanner::scan`].
#[derive(Debug, thiserror::Error)]
pub enum GitHistoryError {
    /// Opening the repository at the given path failed.
    #[error("Failed to open repository: {0}")]
    RepoOpen(String),
    /// The configured branch could not be resolved under `refs/heads/`.
    #[error("Branch '{0}' not found: {1}")]
    BranchNotFound(String, String),
    /// HEAD could not be resolved (no branch configured).
    #[error("Could not find HEAD: {0}")]
    HeadNotFound(String),
    /// The revision walk could not be started.
    #[error("Error walking commits: {0}")]
    WalkError(String),
    /// A `since`/`until` filter was not a valid `YYYY-MM-DD` date.
    #[error("Invalid date format '{0}' (expected YYYY-MM-DD)")]
    InvalidDate(String),
}
#[cfg(test)]
mod tests {
    use super::*;

    // Valid dates parse to Some, `None` passes through as "no filter",
    // and wrong separators are rejected.
    #[test]
    fn test_date_parsing() {
        let scanner = GitHistoryScanner::new(GitHistoryConfig::default());
        let result = scanner.parse_date(&Some("2024-01-15".into()));
        assert!(result.is_ok());
        assert!(result.unwrap().is_some());
        let result = scanner.parse_date(&None);
        assert!(result.is_ok());
        assert!(result.unwrap().is_none());
        // Slash-separated dates do not match the YYYY-MM-DD format.
        let result = scanner.parse_date(&Some("2024/01/15".into()));
        assert!(result.is_err());
    }

    // Extension filter: only configured extensions are accepted.
    #[test]
    fn test_should_include_file() {
        let scanner = GitHistoryScanner::new(GitHistoryConfig {
            extensions: vec!["rs".into(), "toml".into()],
            ..Default::default()
        });
        assert!(scanner.should_include_file(Path::new("src/main.rs")));
        assert!(scanner.should_include_file(Path::new("Cargo.toml")));
        assert!(!scanner.should_include_file(Path::new("README.md")));
        assert!(!scanner.should_include_file(Path::new("src/lib.py")));
    }

    // The epoch itself is day 0; later dates are strictly positive.
    #[test]
    fn test_days_since_epoch() {
        let days = days_since_epoch(2024, 1, 1);
        assert!(days > 0);
        let days = days_since_epoch(1970, 1, 1);
        assert_eq!(days, 0);
    }

    // Century rule: 2000 (÷400) is leap, 1900 (÷100 only) is not.
    #[test]
    fn test_leap_year() {
        assert!(is_leap_year(2000));
        assert!(is_leap_year(2024));
        assert!(!is_leap_year(2023));
        assert!(!is_leap_year(1900));
    }
}