Skip to main content

argus_gitpulse/
mining.rs

1//! Git history extraction via git2.
2//!
3//! Mines commit history from a repository, extracting per-commit
4//! file changes with line counts, author info, and timestamps.
5
6use std::path::Path;
7
8use argus_core::ArgusError;
9use git2::{Delta, DiffOptions, Repository, Sort};
10
/// Raw commit data extracted from git history.
///
/// Values compare field by field, so mined results can be checked with
/// `assert_eq!` or deduplicated by callers.
///
/// # Examples
///
/// ```
/// use argus_gitpulse::mining::CommitInfo;
///
/// let info = CommitInfo {
///     hash: "abc123".into(),
///     author: "alice".into(),
///     email: "alice@example.com".into(),
///     timestamp: 1700000000,
///     message: "fix: auth bug".into(),
///     files_changed: vec![],
/// };
/// assert_eq!(info.author, "alice");
/// assert_eq!(info, info.clone());
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CommitInfo {
    /// Abbreviated commit hash (`mine_history` keeps at most the first
    /// 8 characters of the full object id).
    pub hash: String,
    /// Author name (`"unknown"` when the name is not valid UTF-8).
    pub author: String,
    /// Author email (`"unknown"` when the email is not valid UTF-8).
    pub email: String,
    /// Commit time in seconds since the Unix epoch.
    pub timestamp: i64,
    /// First line of the commit message (empty when there is none).
    pub message: String,
    /// Files modified in this commit.
    pub files_changed: Vec<FileChange>,
}

/// A single file change within a commit.
///
/// # Examples
///
/// ```
/// use argus_gitpulse::mining::{FileChange, ChangeStatus};
///
/// let change = FileChange {
///     path: "src/main.rs".into(),
///     lines_added: 10,
///     lines_deleted: 3,
///     status: ChangeStatus::Modified,
/// };
/// assert_eq!(change.lines_added, 10);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileChange {
    /// File path relative to repo root. For deleted files this is the path
    /// the file had before deletion; for renames it is the new path.
    pub path: String,
    /// Lines added in this commit.
    pub lines_added: u64,
    /// Lines deleted in this commit.
    pub lines_deleted: u64,
    /// Type of change.
    pub status: ChangeStatus,
}

/// Status of a file change within a commit.
///
/// # Examples
///
/// ```
/// use argus_gitpulse::mining::ChangeStatus;
///
/// let status = ChangeStatus::Added;
/// assert_eq!(format!("{status:?}"), "Added");
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChangeStatus {
    /// New file.
    Added,
    /// Existing file modified.
    Modified,
    /// File removed.
    Deleted,
    /// File renamed from another path.
    Renamed {
        /// Original path before rename.
        from: String,
    },
}
95
/// Options for history mining.
///
/// # Examples
///
/// ```
/// use argus_gitpulse::mining::MiningOptions;
///
/// let opts = MiningOptions::default();
/// assert_eq!(opts.since_days, 180);
/// assert_eq!(opts.max_files_per_commit, 25);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MiningOptions {
    /// Only include commits from the last N days (default: 180).
    pub since_days: u64,
    /// Skip commits touching more files than this (default: 25).
    pub max_files_per_commit: usize,
    /// Short name of the branch to walk; `None` walks from HEAD (the default).
    pub branch: Option<String>,
}

impl Default for MiningOptions {
    /// Look back 180 days, skip commits touching more than 25 files, and
    /// start from HEAD.
    fn default() -> Self {
        Self {
            since_days: 180,
            max_files_per_commit: 25,
            branch: None,
        }
    }
}
125
126/// Mine commit history from a git repository.
127///
128/// Returns commits in reverse chronological order (newest first).
129/// Skips merge commits with more files than `max_files_per_commit`.
130///
131/// # Errors
132///
133/// Returns [`ArgusError::Git`] if the repository cannot be opened or walked.
134///
135/// # Examples
136///
137/// ```no_run
138/// use std::path::Path;
139/// use argus_gitpulse::mining::{mine_history, MiningOptions};
140///
141/// let commits = mine_history(Path::new("."), &MiningOptions::default()).unwrap();
142/// for c in &commits {
143///     println!("{}: {} ({})", &c.hash[..7], c.message, c.author);
144/// }
145/// ```
146pub fn mine_history(
147    repo_path: &Path,
148    options: &MiningOptions,
149) -> Result<Vec<CommitInfo>, ArgusError> {
150    let repo = Repository::open(repo_path)
151        .map_err(|e| ArgusError::Git(format!("failed to open repository: {e}")))?;
152
153    let mut revwalk = repo
154        .revwalk()
155        .map_err(|e| ArgusError::Git(format!("failed to create revwalk: {e}")))?;
156
157    revwalk.set_sorting(Sort::TIME).ok();
158
159    // Start from HEAD or specified branch
160    if let Some(ref branch) = options.branch {
161        let reference = repo
162            .resolve_reference_from_short_name(branch)
163            .map_err(|e| ArgusError::Git(format!("failed to resolve branch '{branch}': {e}")))?;
164        let oid = reference
165            .target()
166            .ok_or_else(|| ArgusError::Git("branch has no target".into()))?;
167        revwalk
168            .push(oid)
169            .map_err(|e| ArgusError::Git(format!("failed to push oid: {e}")))?;
170    } else {
171        revwalk
172            .push_head()
173            .map_err(|e| ArgusError::Git(format!("failed to push HEAD: {e}")))?;
174    }
175
176    let cutoff = compute_cutoff(options.since_days);
177    let mut commits = Vec::new();
178
179    for oid_result in revwalk {
180        let oid = oid_result.map_err(|e| ArgusError::Git(format!("revwalk error: {e}")))?;
181
182        let commit = repo
183            .find_commit(oid)
184            .map_err(|e| ArgusError::Git(format!("failed to find commit: {e}")))?;
185
186        let timestamp = commit.time().seconds();
187        if timestamp < cutoff {
188            break;
189        }
190
191        // Skip merge commits with too many parents (unless they have few file changes)
192        let parent_count = commit.parent_count();
193        if parent_count > 1 {
194            // Check file count before skipping
195            let file_count = count_diff_files(&repo, &commit)?;
196            if file_count > options.max_files_per_commit {
197                continue;
198            }
199        }
200
201        let files_changed = extract_file_changes(&repo, &commit)?;
202
203        // Skip commits with too many files (large refactors)
204        if files_changed.len() > options.max_files_per_commit {
205            continue;
206        }
207
208        let author = commit.author();
209        let hash = oid.to_string();
210
211        commits.push(CommitInfo {
212            hash: hash[..hash.len().min(8)].to_string(),
213            author: author.name().unwrap_or("unknown").to_string(),
214            email: author.email().unwrap_or("unknown").to_string(),
215            timestamp,
216            message: commit
217                .message()
218                .unwrap_or("")
219                .lines()
220                .next()
221                .unwrap_or("")
222                .to_string(),
223            files_changed,
224        });
225    }
226
227    Ok(commits)
228}
229
/// Compute the Unix timestamp (in seconds) lying `since_days` days before now.
///
/// All arithmetic saturates: an arbitrarily large `since_days` (for example
/// `u64::MAX` used as "no limit") yields a cutoff far in the past instead of
/// wrapping into the future or overflowing. If the system clock reads before
/// the Unix epoch, "now" is treated as 0.
fn compute_cutoff(since_days: u64) -> i64 {
    const SECONDS_PER_DAY: i64 = 86_400;

    let now_secs = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();
    let now = i64::try_from(now_secs).unwrap_or(i64::MAX);

    // Clamp instead of casting: `u64::MAX as i64` would be -1 and move the
    // cutoff into the future, excluding every commit.
    let window = i64::try_from(since_days)
        .unwrap_or(i64::MAX)
        .saturating_mul(SECONDS_PER_DAY);

    now.saturating_sub(window)
}
237
238fn count_diff_files(repo: &Repository, commit: &git2::Commit) -> Result<usize, ArgusError> {
239    let commit_tree = commit
240        .tree()
241        .map_err(|e| ArgusError::Git(format!("failed to get commit tree: {e}")))?;
242
243    let parent_tree = if commit.parent_count() > 0 {
244        let parent = commit
245            .parent(0)
246            .map_err(|e| ArgusError::Git(format!("failed to get parent: {e}")))?;
247        Some(
248            parent
249                .tree()
250                .map_err(|e| ArgusError::Git(format!("failed to get parent tree: {e}")))?,
251        )
252    } else {
253        None
254    };
255
256    let mut diff_opts = DiffOptions::new();
257    let diff = repo
258        .diff_tree_to_tree(
259            parent_tree.as_ref(),
260            Some(&commit_tree),
261            Some(&mut diff_opts),
262        )
263        .map_err(|e| ArgusError::Git(format!("failed to compute diff: {e}")))?;
264
265    Ok(diff.deltas().len())
266}
267
268fn extract_file_changes(
269    repo: &Repository,
270    commit: &git2::Commit,
271) -> Result<Vec<FileChange>, ArgusError> {
272    let commit_tree = commit
273        .tree()
274        .map_err(|e| ArgusError::Git(format!("failed to get commit tree: {e}")))?;
275
276    let parent_tree = if commit.parent_count() > 0 {
277        let parent = commit
278            .parent(0)
279            .map_err(|e| ArgusError::Git(format!("failed to get parent: {e}")))?;
280        Some(
281            parent
282                .tree()
283                .map_err(|e| ArgusError::Git(format!("failed to get parent tree: {e}")))?,
284        )
285    } else {
286        None
287    };
288
289    let mut diff_opts = DiffOptions::new();
290    let diff = repo
291        .diff_tree_to_tree(
292            parent_tree.as_ref(),
293            Some(&commit_tree),
294            Some(&mut diff_opts),
295        )
296        .map_err(|e| ArgusError::Git(format!("failed to compute diff: {e}")))?;
297
298    // Enable rename detection
299    let mut find_opts = git2::DiffFindOptions::new();
300    find_opts.renames(true);
301    let mut diff = diff;
302    diff.find_similar(Some(&mut find_opts))
303        .map_err(|e| ArgusError::Git(format!("failed to find renames: {e}")))?;
304
305    let mut changes = Vec::new();
306    let num_deltas = diff.deltas().len();
307
308    for delta_idx in 0..num_deltas {
309        let delta = diff.get_delta(delta_idx).unwrap();
310
311        let new_file = delta.new_file();
312        let path = new_file
313            .path()
314            .unwrap_or(Path::new(""))
315            .to_string_lossy()
316            .to_string();
317
318        if path.is_empty() {
319            continue;
320        }
321
322        let status = match delta.status() {
323            Delta::Added => ChangeStatus::Added,
324            Delta::Deleted => {
325                let old_path = delta
326                    .old_file()
327                    .path()
328                    .unwrap_or(Path::new(""))
329                    .to_string_lossy()
330                    .to_string();
331                // Use old path for deleted files
332                changes.push(FileChange {
333                    path: old_path,
334                    lines_added: 0,
335                    lines_deleted: 0,
336                    status: ChangeStatus::Deleted,
337                });
338                continue;
339            }
340            Delta::Modified => ChangeStatus::Modified,
341            Delta::Renamed => {
342                let old_path = delta
343                    .old_file()
344                    .path()
345                    .unwrap_or(Path::new(""))
346                    .to_string_lossy()
347                    .to_string();
348                ChangeStatus::Renamed { from: old_path }
349            }
350            _ => ChangeStatus::Modified,
351        };
352
353        changes.push(FileChange {
354            path,
355            lines_added: 0,
356            lines_deleted: 0,
357            status,
358        });
359    }
360
361    // Count lines added/deleted per file using foreach
362    let mut line_counts: std::collections::HashMap<String, (u64, u64)> =
363        std::collections::HashMap::new();
364
365    diff.foreach(
366        &mut |_delta, _progress| true,
367        None,
368        None,
369        Some(&mut |delta, _hunk, line| {
370            let path = delta
371                .new_file()
372                .path()
373                .or_else(|| delta.old_file().path())
374                .unwrap_or(Path::new(""))
375                .to_string_lossy()
376                .to_string();
377
378            let entry = line_counts.entry(path).or_insert((0, 0));
379            match line.origin() {
380                '+' => entry.0 += 1,
381                '-' => entry.1 += 1,
382                _ => {}
383            }
384            true
385        }),
386    )
387    .map_err(|e| ArgusError::Git(format!("failed to iterate diff lines: {e}")))?;
388
389    // Apply line counts to changes
390    for change in &mut changes {
391        if let Some((added, deleted)) = line_counts.get(&change.path) {
392            change.lines_added = *added;
393            change.lines_deleted = *deleted;
394        }
395    }
396
397    Ok(changes)
398}
399
#[cfg(test)]
mod tests {
    use super::*;

    /// Look-back window (about 100 years) wide enough to cover the whole
    /// history, so assertions about "the repo has commits" do not start
    /// failing once the repository has been idle for a while.
    const FULL_HISTORY_DAYS: u64 = 36_500;

    #[test]
    fn mining_options_defaults_are_correct() {
        let opts = MiningOptions::default();
        assert_eq!(opts.since_days, 180);
        assert_eq!(opts.max_files_per_commit, 25);
        assert!(opts.branch.is_none());
    }

    #[test]
    fn mine_argus_repo_returns_commits() {
        // Find the repo root (this test runs from crate dir or workspace root).
        let repo_path = find_repo_root().expect("should find repo root");
        let opts = MiningOptions {
            since_days: FULL_HISTORY_DAYS,
            ..MiningOptions::default()
        };
        let commits = mine_history(&repo_path, &opts).unwrap();
        assert!(!commits.is_empty(), "argus repo should have commits");
        // Verify basic structure.
        let first = &commits[0];
        assert!(!first.hash.is_empty());
        assert!(!first.author.is_empty());
        assert!(first.timestamp > 0);
    }

    #[test]
    fn large_commits_are_skipped() {
        let repo_path = find_repo_root().expect("should find repo root");
        let opts = MiningOptions {
            since_days: 365,
            max_files_per_commit: 2, // Very small threshold
            ..MiningOptions::default()
        };
        let commits = mine_history(&repo_path, &opts).unwrap();
        // All returned commits should have <= 2 files.
        for commit in &commits {
            assert!(
                commit.files_changed.len() <= 2,
                "commit {} has {} files, expected <= 2",
                commit.hash,
                commit.files_changed.len()
            );
        }
    }

    #[test]
    fn change_status_identifies_correctly() {
        let added = ChangeStatus::Added;
        let modified = ChangeStatus::Modified;
        let deleted = ChangeStatus::Deleted;
        let renamed = ChangeStatus::Renamed {
            from: "old.rs".into(),
        };

        assert_eq!(added, ChangeStatus::Added);
        assert_eq!(modified, ChangeStatus::Modified);
        assert_eq!(deleted, ChangeStatus::Deleted);
        assert_ne!(renamed, ChangeStatus::Modified);
    }

    /// Walk upwards from the current directory until a `.git` entry is found.
    fn find_repo_root() -> Option<std::path::PathBuf> {
        let mut path = std::env::current_dir().ok()?;
        loop {
            if path.join(".git").exists() {
                return Some(path);
            }
            if !path.pop() {
                return None;
            }
        }
    }
}