Skip to main content

cgx_engine/
git.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use git2::{Repository, Sort};
5
/// Ownership entries for one file: `(author name, author email, ownership fraction)`.
type OwnerList = Vec<(String, String, f64)>;
/// Co-change entries: `(first path, second path, normalised co-change score)`.
type CoChangeList = Vec<(String, String, f64)>;

/// Results of analysing a repository's git history.
///
/// `Default` produces an analysis with no churn, ownership, or co-change data, which is handy
/// as a neutral value when no history is available.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct GitAnalysis {
    /// Normalised churn score per file path (0.0–1.0) over the last 90 days.
    pub file_churn: HashMap<String, f64>,
    /// Top-3 authors per file: `(name, email, ownership_fraction)`.
    pub file_owners: HashMap<String, OwnerList>,
    /// File pairs that changed together, with normalised co-change score (0.0–1.0).
    pub co_changes: CoChangeList,
}
18
19/// Analyse a repository's git history to extract churn, ownership, and co-change data.
20///
21/// - **Churn** — commit frequency per file over the last 90 days, normalised to 0.0–1.0.
22/// - **Ownership** — top-3 authors by line count via `git blame` for each path in `file_paths`.
23/// - **Co-changes** — file pairs that appear together in the same commit over the last year.
24pub fn analyze_repo(repo_path: &Path, file_paths: &[String]) -> anyhow::Result<GitAnalysis> {
25    let repo = Repository::open(repo_path)?;
26
27    let (file_churn, co_changes) = compute_churn_and_co_changes(&repo)?;
28    let file_owners = compute_blame(&repo, file_paths)?;
29
30    Ok(GitAnalysis {
31        file_churn,
32        file_owners,
33        co_changes,
34    })
35}
36
37fn compute_churn_and_co_changes(
38    repo: &Repository,
39) -> anyhow::Result<(HashMap<String, f64>, CoChangeList)> {
40    // 90 days for churn scoring; 365 days for co-change pairs (stable repos commit infrequently)
41    let churn_cutoff = chrono::Utc::now().timestamp() - 90 * 86400;
42    let co_change_cutoff = chrono::Utc::now().timestamp() - 365 * 86400;
43
44    let mut commit_counts: HashMap<String, u32> = HashMap::new();
45    let mut pair_counts: HashMap<(String, String), u32> = HashMap::new();
46    let mut max_churn: u32 = 0;
47    let mut max_co: u32 = 0;
48
49    let mut revwalk = repo.revwalk()?;
50    revwalk.push_head()?;
51    revwalk.set_sorting(Sort::TIME)?;
52
53    for oid_result in revwalk {
54        let oid = match oid_result {
55            Ok(o) => o,
56            Err(_) => continue,
57        };
58        let commit = match repo.find_commit(oid) {
59            Ok(c) => c,
60            Err(_) => continue,
61        };
62
63        let commit_ts = commit.time().seconds();
64        if commit_ts < co_change_cutoff {
65            break;
66        }
67
68        let commit_tree = match commit.tree() {
69            Ok(t) => t,
70            Err(_) => continue,
71        };
72
73        let mut parent_tree = None;
74        if let Ok(parent) = commit.parent(0) {
75            if let Ok(tree) = parent.tree() {
76                parent_tree = Some(tree);
77            }
78        }
79
80        let diff = repo.diff_tree_to_tree(parent_tree.as_ref(), Some(&commit_tree), None)?;
81
82        let mut changed_files: Vec<String> = Vec::new();
83
84        diff.foreach(
85            &mut |delta, _| {
86                if let Some(path) = delta.new_file().path() {
87                    if let Some(s) = path.to_str() {
88                        changed_files.push(s.to_string());
89                    }
90                }
91                true
92            },
93            None,
94            None,
95            None,
96        )?;
97
98        changed_files.sort();
99        changed_files.dedup();
100
101        // Only count churn for commits within the 90-day window
102        if commit_ts >= churn_cutoff {
103            for file in &changed_files {
104                let count = commit_counts.entry(file.clone()).or_insert(0);
105                *count += 1;
106                if *count > max_churn {
107                    max_churn = *count;
108                }
109            }
110        }
111
112        for i in 0..changed_files.len() {
113            for j in (i + 1)..changed_files.len() {
114                let pair = (changed_files[i].clone(), changed_files[j].clone());
115                let count = pair_counts.entry(pair).or_insert(0);
116                *count += 1;
117                if *count > max_co {
118                    max_co = *count;
119                }
120            }
121        }
122    }
123
124    let mut churn_map = HashMap::new();
125    if max_churn > 0 {
126        for (file, count) in commit_counts {
127            churn_map.insert(file, count as f64 / max_churn as f64);
128        }
129    }
130
131    let mut co_results: CoChangeList = Vec::new();
132    let min_co_count = 2u32;
133    if max_co > 0 {
134        for ((a, b), count) in pair_counts {
135            if count >= min_co_count {
136                co_results.push((a, b, count as f64 / max_co as f64));
137            }
138        }
139    }
140
141    Ok((churn_map, co_results))
142}
143
144fn compute_blame(
145    repo: &Repository,
146    file_paths: &[String],
147) -> anyhow::Result<HashMap<String, OwnerList>> {
148    let mut owners: HashMap<String, OwnerList> = HashMap::new();
149
150    for file_path in file_paths {
151        let blame = match repo.blame_file(std::path::Path::new(file_path), None) {
152            Ok(b) => b,
153            Err(e) => {
154                tracing::debug!("blame failed for {}: {}", file_path, e);
155                continue;
156            }
157        };
158
159        let mut author_lines: HashMap<String, (String, u32)> = HashMap::new();
160        let mut total_lines: u32 = 0;
161
162        for hunk in blame.iter() {
163            let sig = hunk.final_signature();
164            let name = sig.name().unwrap_or("unknown").to_string();
165            let email = sig.email().unwrap_or("unknown").to_string();
166            let lines = hunk.lines_in_hunk() as u32;
167
168            let key = email.clone();
169            let entry = author_lines.entry(key).or_insert((name, 0));
170            entry.1 += lines;
171            total_lines += lines;
172        }
173
174        if total_lines > 0 {
175            let mut file_owners: Vec<(String, String, f64)> = author_lines
176                .into_iter()
177                .map(|(email, (name, lines))| (name, email, lines as f64 / total_lines as f64))
178                .collect();
179            file_owners.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
180            file_owners.truncate(3);
181            owners.insert(file_path.clone(), file_owners);
182        }
183    }
184
185    Ok(owners)
186}