codemem_engine/enrichment/
git.rs1use super::EnrichResult;
4use crate::CodememEngine;
5use codemem_core::{CodememError, Edge, GraphBackend, RelationshipType};
6use serde_json::json;
7use std::collections::{HashMap, HashSet};
8
9impl CodememEngine {
10 pub fn enrich_git_history(
12 &self,
13 path: &str,
14 days: u64,
15 namespace: Option<&str>,
16 ) -> Result<EnrichResult, CodememError> {
17 let output = std::process::Command::new("git")
19 .args([
20 "-C",
21 path,
22 "log",
23 "--format=COMMIT:%H|%an|%aI",
24 "--name-only",
25 &format!("--since={days} days ago"),
26 ])
27 .output()
28 .map_err(|e| CodememError::Internal(format!("Failed to run git: {e}")))?;
29
30 if !output.status.success() {
31 let stderr = String::from_utf8_lossy(&output.stderr);
32 return Err(CodememError::Internal(format!("git log failed: {stderr}")));
33 }
34
35 let stdout = String::from_utf8_lossy(&output.stdout);
36
37 struct Commit {
39 author: String,
40 date: chrono::DateTime<chrono::Utc>,
41 files: Vec<String>,
42 }
43
44 let mut commits: Vec<Commit> = Vec::new();
45
46 for block in stdout.split("COMMIT:").skip(1) {
47 let mut lines = block.lines();
48 if let Some(header) = lines.next() {
49 let parts: Vec<&str> = header.splitn(3, '|').collect();
50 if parts.len() >= 3 {
51 let author = parts[1].to_string();
52 let date = chrono::DateTime::parse_from_rfc3339(parts[2])
53 .map(|dt| dt.with_timezone(&chrono::Utc))
54 .unwrap_or_else(|_| chrono::Utc::now());
55 let files: Vec<String> = lines
56 .filter(|l| !l.trim().is_empty())
57 .map(|l| l.trim().to_string())
58 .collect();
59 if !files.is_empty() {
60 commits.push(Commit {
61 author,
62 date,
63 files,
64 });
65 }
66 }
67 }
68 }
69
70 let total_commits = commits.len();
71
72 struct FileStats {
74 commit_count: usize,
75 authors: HashSet<String>,
76 }
77
78 let mut file_stats: HashMap<String, FileStats> = HashMap::new();
79 let mut author_file_count: HashMap<String, usize> = HashMap::new();
80 let mut author_commit_count: HashMap<String, usize> = HashMap::new();
81
82 struct CoChangeInfo {
84 count: usize,
85 earliest: chrono::DateTime<chrono::Utc>,
86 latest: chrono::DateTime<chrono::Utc>,
87 }
88 let mut co_change_info: HashMap<(String, String), CoChangeInfo> = HashMap::new();
89
90 for commit in &commits {
91 *author_commit_count
92 .entry(commit.author.clone())
93 .or_default() += 1;
94
95 for file in &commit.files {
96 let stats = file_stats.entry(file.clone()).or_insert(FileStats {
97 commit_count: 0,
98 authors: HashSet::new(),
99 });
100 stats.commit_count += 1;
101 stats.authors.insert(commit.author.clone());
102 *author_file_count.entry(commit.author.clone()).or_default() += 1;
103 }
104
105 let mut sorted_files: Vec<&String> = commit.files.iter().collect();
108 if sorted_files.len() > 50 {
109 continue;
110 }
111 sorted_files.sort();
112 for i in 0..sorted_files.len() {
113 for j in (i + 1)..sorted_files.len() {
114 let key = (sorted_files[i].clone(), sorted_files[j].clone());
115 let entry = co_change_info.entry(key).or_insert(CoChangeInfo {
116 count: 0,
117 earliest: commit.date,
118 latest: commit.date,
119 });
120 entry.count += 1;
121 if commit.date < entry.earliest {
122 entry.earliest = commit.date;
123 }
124 if commit.date > entry.latest {
125 entry.latest = commit.date;
126 }
127 }
128 }
129 }
130
131 let mut files_annotated = 0;
133 {
134 let mut graph = self.lock_graph()?;
135
136 for (file_path, stats) in &file_stats {
137 let node_id = format!("file:{file_path}");
138 if let Ok(Some(mut node)) = graph.get_node(&node_id) {
139 node.payload
140 .insert("git_commit_count".into(), json!(stats.commit_count));
141 node.payload
142 .insert("git_authors".into(), json!(stats.authors));
143 let churn_rate = if days > 0 {
144 stats.commit_count as f64 / (days as f64 / 30.0)
145 } else {
146 0.0
147 };
148 node.payload
149 .insert("git_churn_rate".into(), json!(churn_rate));
150 let _ = graph.add_node(node);
151 files_annotated += 1;
152 }
153 }
154 }
155
156 let co_change_threshold = 2;
158 let mut co_change_edges_created = 0;
159 {
160 let mut graph = self.lock_graph()?;
161
162 for ((file_a, file_b), info) in &co_change_info {
163 if info.count < co_change_threshold {
164 continue;
165 }
166 let src_id = format!("file:{file_a}");
167 let dst_id = format!("file:{file_b}");
168
169 if graph.get_node(&src_id).ok().flatten().is_none()
170 || graph.get_node(&dst_id).ok().flatten().is_none()
171 {
172 continue;
173 }
174
175 let weight = if total_commits > 0 {
176 info.count as f64 / total_commits as f64
177 } else {
178 0.0
179 };
180
181 let edge = Edge {
182 id: format!("cochange:{}:{}", file_a, file_b),
183 src: src_id,
184 dst: dst_id,
185 relationship: RelationshipType::CoChanged,
186 weight,
187 properties: HashMap::from([("commit_count".into(), json!(info.count))]),
188 created_at: chrono::Utc::now(),
189 valid_from: Some(info.earliest),
190 valid_to: Some(info.latest),
191 };
192 let _ = self.storage.insert_graph_edge(&edge);
193 if graph.add_edge(edge).is_ok() {
194 co_change_edges_created += 1;
195 }
196 }
197 }
198
199 let mut insights_stored = 0;
201
202 for (file_path, stats) in &file_stats {
204 if stats.commit_count > self.config.enrichment.git_min_commit_count {
205 let mut sorted_authors: Vec<_> = stats.authors.iter().collect();
206 sorted_authors.sort();
207 let authors_str = sorted_authors
208 .iter()
209 .map(|s| s.as_str())
210 .collect::<Vec<_>>()
211 .join(", ");
212 let content = format!(
213 "High activity: {} — {} commits in the last {} days by {}",
214 file_path, stats.commit_count, days, authors_str
215 );
216 let importance = (stats.commit_count as f64 / 100.0).clamp(0.2, 0.6);
217 if self
218 .store_insight(
219 &content,
220 "activity",
221 &["git-history"],
222 importance,
223 namespace,
224 &[format!("file:{file_path}")],
225 )
226 .is_some()
227 {
228 insights_stored += 1;
229 }
230 }
231 }
232
233 for ((file_a, file_b), info) in &co_change_info {
235 if info.count >= self.config.enrichment.git_min_co_change_count {
236 let content = format!(
237 "Co-change pattern: {} and {} change together in {} commits — likely coupled",
238 file_a, file_b, info.count
239 );
240 if self
241 .store_insight(
242 &content,
243 "activity",
244 &["git-history", "coupling"],
245 0.4,
246 namespace,
247 &[format!("file:{file_a}"), format!("file:{file_b}")],
248 )
249 .is_some()
250 {
251 insights_stored += 1;
252 }
253 }
254 }
255
256 let mut author_vec: Vec<_> = author_commit_count.iter().collect();
258 author_vec.sort_by(|a, b| b.1.cmp(a.1));
259 for (author, commit_count) in author_vec.iter().take(3) {
260 let file_count = author_file_count.get(*author).unwrap_or(&0);
261 let content = format!(
262 "Most active contributor: {} with {} commits across {} files",
263 author, commit_count, file_count
264 );
265 if self
266 .store_insight(
267 &content,
268 "activity",
269 &["git-history", "contributor"],
270 0.5,
271 namespace,
272 &[],
273 )
274 .is_some()
275 {
276 insights_stored += 1;
277 }
278 }
279
280 self.save_index();
281
282 Ok(EnrichResult {
283 insights_stored,
284 details: json!({
285 "total_commits": total_commits,
286 "files_annotated": files_annotated,
287 "co_change_edges_created": co_change_edges_created,
288 "insights_stored": insights_stored,
289 "unique_authors": author_commit_count.len(),
290 }),
291 })
292 }
293}