Skip to main content

codemem_engine/enrichment/
git.rs

1//! Git history enrichment: file activity, co-changes, contributors.
2
3use super::EnrichResult;
4use crate::CodememEngine;
5use codemem_core::{CodememError, Edge, GraphBackend, RelationshipType};
6use serde_json::json;
7use std::collections::{HashMap, HashSet};
8
9impl CodememEngine {
10    /// Enrich the graph with git history analysis: file activity, co-changes, contributors.
11    pub fn enrich_git_history(
12        &self,
13        path: &str,
14        days: u64,
15        namespace: Option<&str>,
16    ) -> Result<EnrichResult, CodememError> {
17        // Run git log
18        let output = std::process::Command::new("git")
19            .args([
20                "-C",
21                path,
22                "log",
23                "--format=COMMIT:%H|%an|%aI",
24                "--name-only",
25                &format!("--since={days} days ago"),
26            ])
27            .output()
28            .map_err(|e| CodememError::Internal(format!("Failed to run git: {e}")))?;
29
30        if !output.status.success() {
31            let stderr = String::from_utf8_lossy(&output.stderr);
32            return Err(CodememError::Internal(format!("git log failed: {stderr}")));
33        }
34
35        let stdout = String::from_utf8_lossy(&output.stdout);
36
37        // Parse commits
38        struct Commit {
39            author: String,
40            date: chrono::DateTime<chrono::Utc>,
41            files: Vec<String>,
42        }
43
44        let mut commits: Vec<Commit> = Vec::new();
45
46        for block in stdout.split("COMMIT:").skip(1) {
47            let mut lines = block.lines();
48            if let Some(header) = lines.next() {
49                let parts: Vec<&str> = header.splitn(3, '|').collect();
50                if parts.len() >= 3 {
51                    let author = parts[1].to_string();
52                    let date = chrono::DateTime::parse_from_rfc3339(parts[2])
53                        .map(|dt| dt.with_timezone(&chrono::Utc))
54                        .unwrap_or_else(|_| chrono::Utc::now());
55                    let files: Vec<String> = lines
56                        .filter(|l| !l.trim().is_empty())
57                        .map(|l| l.trim().to_string())
58                        .collect();
59                    if !files.is_empty() {
60                        commits.push(Commit {
61                            author,
62                            date,
63                            files,
64                        });
65                    }
66                }
67            }
68        }
69
70        let total_commits = commits.len();
71
72        // Aggregate per-file stats
73        struct FileStats {
74            commit_count: usize,
75            authors: HashSet<String>,
76        }
77
78        let mut file_stats: HashMap<String, FileStats> = HashMap::new();
79        let mut author_file_count: HashMap<String, usize> = HashMap::new();
80        let mut author_commit_count: HashMap<String, usize> = HashMap::new();
81
82        // Co-change tracking with temporal info
83        struct CoChangeInfo {
84            count: usize,
85            earliest: chrono::DateTime<chrono::Utc>,
86            latest: chrono::DateTime<chrono::Utc>,
87        }
88        let mut co_change_info: HashMap<(String, String), CoChangeInfo> = HashMap::new();
89
90        for commit in &commits {
91            *author_commit_count
92                .entry(commit.author.clone())
93                .or_default() += 1;
94
95            for file in &commit.files {
96                let stats = file_stats.entry(file.clone()).or_insert(FileStats {
97                    commit_count: 0,
98                    authors: HashSet::new(),
99                });
100                stats.commit_count += 1;
101                stats.authors.insert(commit.author.clone());
102                *author_file_count.entry(commit.author.clone()).or_default() += 1;
103            }
104
105            // Track co-changes (pairs of files in same commit)
106            // Skip bulk refactor commits (>50 files) to avoid O(N^2) explosion
107            let mut sorted_files: Vec<&String> = commit.files.iter().collect();
108            if sorted_files.len() > 50 {
109                continue;
110            }
111            sorted_files.sort();
112            for i in 0..sorted_files.len() {
113                for j in (i + 1)..sorted_files.len() {
114                    let key = (sorted_files[i].clone(), sorted_files[j].clone());
115                    let entry = co_change_info.entry(key).or_insert(CoChangeInfo {
116                        count: 0,
117                        earliest: commit.date,
118                        latest: commit.date,
119                    });
120                    entry.count += 1;
121                    if commit.date < entry.earliest {
122                        entry.earliest = commit.date;
123                    }
124                    if commit.date > entry.latest {
125                        entry.latest = commit.date;
126                    }
127                }
128            }
129        }
130
131        // Annotate graph nodes with git stats
132        let mut files_annotated = 0;
133        {
134            let mut graph = self.lock_graph()?;
135
136            for (file_path, stats) in &file_stats {
137                let node_id = format!("file:{file_path}");
138                if let Ok(Some(mut node)) = graph.get_node(&node_id) {
139                    node.payload
140                        .insert("git_commit_count".into(), json!(stats.commit_count));
141                    node.payload
142                        .insert("git_authors".into(), json!(stats.authors));
143                    let churn_rate = if days > 0 {
144                        stats.commit_count as f64 / (days as f64 / 30.0)
145                    } else {
146                        0.0
147                    };
148                    node.payload
149                        .insert("git_churn_rate".into(), json!(churn_rate));
150                    let _ = graph.add_node(node);
151                    files_annotated += 1;
152                }
153            }
154        }
155
156        // Create co-change edges (threshold: 2+ co-occurrences) with temporal data
157        let co_change_threshold = 2;
158        let mut co_change_edges_created = 0;
159        {
160            let mut graph = self.lock_graph()?;
161
162            for ((file_a, file_b), info) in &co_change_info {
163                if info.count < co_change_threshold {
164                    continue;
165                }
166                let src_id = format!("file:{file_a}");
167                let dst_id = format!("file:{file_b}");
168
169                if graph.get_node(&src_id).ok().flatten().is_none()
170                    || graph.get_node(&dst_id).ok().flatten().is_none()
171                {
172                    continue;
173                }
174
175                let weight = if total_commits > 0 {
176                    info.count as f64 / total_commits as f64
177                } else {
178                    0.0
179                };
180
181                let edge = Edge {
182                    id: format!("cochange:{}:{}", file_a, file_b),
183                    src: src_id,
184                    dst: dst_id,
185                    relationship: RelationshipType::CoChanged,
186                    weight,
187                    properties: HashMap::from([("commit_count".into(), json!(info.count))]),
188                    created_at: chrono::Utc::now(),
189                    valid_from: Some(info.earliest),
190                    valid_to: Some(info.latest),
191                };
192                let _ = self.storage.insert_graph_edge(&edge);
193                if graph.add_edge(edge).is_ok() {
194                    co_change_edges_created += 1;
195                }
196            }
197        }
198
199        // Store insights
200        let mut insights_stored = 0;
201
202        // High-activity files
203        for (file_path, stats) in &file_stats {
204            if stats.commit_count > self.config.enrichment.git_min_commit_count {
205                let mut sorted_authors: Vec<_> = stats.authors.iter().collect();
206                sorted_authors.sort();
207                let authors_str = sorted_authors
208                    .iter()
209                    .map(|s| s.as_str())
210                    .collect::<Vec<_>>()
211                    .join(", ");
212                let content = format!(
213                    "High activity: {} — {} commits in the last {} days by {}",
214                    file_path, stats.commit_count, days, authors_str
215                );
216                let importance = (stats.commit_count as f64 / 100.0).clamp(0.2, 0.6);
217                if self
218                    .store_insight(
219                        &content,
220                        "activity",
221                        &["git-history"],
222                        importance,
223                        namespace,
224                        &[format!("file:{file_path}")],
225                    )
226                    .is_some()
227                {
228                    insights_stored += 1;
229                }
230            }
231        }
232
233        // Co-change patterns
234        for ((file_a, file_b), info) in &co_change_info {
235            if info.count >= self.config.enrichment.git_min_co_change_count {
236                let content = format!(
237                    "Co-change pattern: {} and {} change together in {} commits — likely coupled",
238                    file_a, file_b, info.count
239                );
240                if self
241                    .store_insight(
242                        &content,
243                        "activity",
244                        &["git-history", "coupling"],
245                        0.4,
246                        namespace,
247                        &[format!("file:{file_a}"), format!("file:{file_b}")],
248                    )
249                    .is_some()
250                {
251                    insights_stored += 1;
252                }
253            }
254        }
255
256        // Most active contributors
257        let mut author_vec: Vec<_> = author_commit_count.iter().collect();
258        author_vec.sort_by(|a, b| b.1.cmp(a.1));
259        for (author, commit_count) in author_vec.iter().take(3) {
260            let file_count = author_file_count.get(*author).unwrap_or(&0);
261            let content = format!(
262                "Most active contributor: {} with {} commits across {} files",
263                author, commit_count, file_count
264            );
265            if self
266                .store_insight(
267                    &content,
268                    "activity",
269                    &["git-history", "contributor"],
270                    0.5,
271                    namespace,
272                    &[],
273                )
274                .is_some()
275            {
276                insights_stored += 1;
277            }
278        }
279
280        self.save_index();
281
282        Ok(EnrichResult {
283            insights_stored,
284            details: json!({
285                "total_commits": total_commits,
286                "files_annotated": files_annotated,
287                "co_change_edges_created": co_change_edges_created,
288                "insights_stored": insights_stored,
289                "unique_authors": author_commit_count.len(),
290            }),
291        })
292    }
293}