Skip to main content

lean_ctx/core/
graph_enricher.rs

1//! Unified Graph Enricher — indexes Git history, tests, and knowledge into the PropertyGraph.
2//!
//! Four enrichment passes:
//! 1. **Git commits**: `git log` → Commit nodes + `changed_in` edges
//! 2. **Test files**: path-naming heuristics → Test nodes + `tested_by` edges
//! 3. **Knowledge bridge**: `ctx_knowledge` facts → Knowledge nodes + `mentioned_in` edges
//! 4. **Call graph**: cross-file calls → `calls` edges between existing file nodes
7
8use crate::core::property_graph::{CodeGraph, Edge, EdgeKind, Node};
9use std::collections::HashSet;
10use std::path::Path;
11
12// ---------------------------------------------------------------------------
13// Git History Indexer
14// ---------------------------------------------------------------------------
15
/// One commit as parsed from `git log` output.
///
/// All fields hold the raw text printed by git; nothing is validated or
/// normalized here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CommitInfo {
    /// Full object hash (`%H`).
    pub hash: String,
    /// Abbreviated object hash (`%h`).
    pub short_hash: String,
    /// Author name (`%an`).
    pub author: String,
    /// Author date (`%ai`).
    pub date: String,
    /// Subject line (`%s`).
    pub message: String,
    /// Paths listed for the commit by `--name-only`; may be empty.
    pub files_changed: Vec<String>,
}
25
26pub fn index_git_history(
27    graph: &CodeGraph,
28    project_root: &Path,
29    max_commits: usize,
30) -> anyhow::Result<EnrichmentStats> {
31    let mut stats = EnrichmentStats::default();
32
33    let output = std::process::Command::new("git")
34        .args([
35            "log",
36            &format!("-{max_commits}"),
37            "--format=%H%n%h%n%an%n%ai%n%s",
38            "--name-only",
39        ])
40        .current_dir(project_root)
41        .output();
42
43    let output = match output {
44        Ok(o) if o.status.success() => String::from_utf8_lossy(&o.stdout).to_string(),
45        _ => return Ok(stats),
46    };
47
48    let commits = parse_git_log(&output);
49    for commit in &commits {
50        let commit_node =
51            Node::commit(&commit.short_hash, &commit.message).with_metadata(&format!(
52                "{{\"author\":\"{}\",\"date\":\"{}\",\"hash\":\"{}\"}}",
53                commit.author, commit.date, commit.hash
54            ));
55
56        let commit_id = graph.upsert_node(&commit_node)?;
57        stats.commits_indexed += 1;
58
59        for file in &commit.files_changed {
60            if let Some(file_node) = graph.get_node_by_path(file)? {
61                if let Some(file_id) = file_node.id {
62                    graph.upsert_edge(&Edge::new(file_id, commit_id, EdgeKind::ChangedIn))?;
63                    stats.edges_created += 1;
64                }
65            }
66        }
67    }
68
69    Ok(stats)
70}
71
72fn parse_git_log(output: &str) -> Vec<CommitInfo> {
73    let mut commits = Vec::new();
74    let mut lines = output.lines().peekable();
75
76    while lines.peek().is_some() {
77        let hash = match lines.next() {
78            Some(h) if !h.is_empty() && h.len() >= 7 => h.to_string(),
79            _ => {
80                lines.next();
81                continue;
82            }
83        };
84
85        let short_hash = match lines.next() {
86            Some(s) => s.to_string(),
87            None => break,
88        };
89        let author = match lines.next() {
90            Some(a) => a.to_string(),
91            None => break,
92        };
93        let date = match lines.next() {
94            Some(d) => d.to_string(),
95            None => break,
96        };
97        let message = match lines.next() {
98            Some(m) => m.to_string(),
99            None => break,
100        };
101
102        let mut files_changed = Vec::new();
103        while let Some(line) = lines.peek() {
104            if line.is_empty() {
105                lines.next();
106                break;
107            }
108            files_changed.push(line.to_string());
109            lines.next();
110        }
111
112        commits.push(CommitInfo {
113            hash,
114            short_hash,
115            author,
116            date,
117            message,
118            files_changed,
119        });
120    }
121
122    commits
123}
124
125// ---------------------------------------------------------------------------
126// Test Indexer
127// ---------------------------------------------------------------------------
128
/// Path fragments that mark a file as test code.
///
/// `is_test_file` checks these against the lower-cased path, so every entry
/// must itself be lower-case.
const TEST_PATTERNS: &[&str] = &[
    // Markers embedded in the file name.
    "_test.",
    "test_",
    ".test.",
    ".spec.",
    "_spec.",
    // Directory markers.
    "tests/",
    "__tests__/",
];
138
139pub fn index_tests(graph: &CodeGraph, project_root: &Path) -> anyhow::Result<EnrichmentStats> {
140    let mut stats = EnrichmentStats::default();
141
142    let output = std::process::Command::new("git")
143        .args(["ls-files"])
144        .current_dir(project_root)
145        .output();
146
147    let files: Vec<String> = match output {
148        Ok(o) if o.status.success() => String::from_utf8_lossy(&o.stdout)
149            .lines()
150            .map(ToString::to_string)
151            .collect(),
152        _ => return Ok(stats),
153    };
154
155    for file in &files {
156        if !is_test_file(file) {
157            continue;
158        }
159
160        let test_node = Node::test(file, file);
161        let test_id = graph.upsert_node(&test_node)?;
162        stats.tests_indexed += 1;
163
164        let tested_file = infer_tested_file(file);
165        if let Some(ref tested) = tested_file {
166            if files.contains(tested) {
167                let target_node = graph.get_node_by_path(tested)?;
168                if let Some(target) = target_node {
169                    if let Some(target_id) = target.id {
170                        graph.upsert_edge(&Edge::new(target_id, test_id, EdgeKind::TestedBy))?;
171                        stats.edges_created += 1;
172                    }
173                } else {
174                    let file_id = graph.upsert_node(&Node::file(tested))?;
175                    graph.upsert_edge(&Edge::new(file_id, test_id, EdgeKind::TestedBy))?;
176                    stats.edges_created += 1;
177                }
178            }
179        }
180    }
181
182    Ok(stats)
183}
184
185fn is_test_file(path: &str) -> bool {
186    let lower = path.to_lowercase();
187    TEST_PATTERNS.iter().any(|p| lower.contains(p))
188}
189
/// Guesses the path of the source file exercised by the test at `test_path`.
///
/// The guess is purely name based and always points into the test file's own
/// directory:
///
/// * `foo_test.rs`, `foo.test.ts`, `foo_spec.rb`, `foo.spec.js` → `foo.<ext>`
/// * `test_foo.py` → `foo.py`
///
/// Returns `None` when the file name matches none of these shapes or the path
/// has no usable file name. The result uses `/` as separator. The caller is
/// responsible for checking that the guessed file actually exists.
fn infer_tested_file(test_path: &str) -> Option<String> {
    // Checked in this order; the first marker found in the name wins.
    const INFIX_MARKERS: [&str; 4] = ["_test.", ".test.", "_spec.", ".spec."];

    let path = Path::new(test_path);
    let name = path.file_name()?.to_str()?;

    let source_name = INFIX_MARKERS
        .iter()
        .find_map(|marker| {
            let pos = name.find(*marker)?;
            // Start at the marker's trailing '.', which becomes the extension dot.
            let ext = &name[pos + marker.len() - 1..];
            Some(format!("{}{}", &name[..pos], ext))
        })
        .or_else(|| name.strip_prefix("test_").map(str::to_string))?;

    let candidate = path.parent()?.join(source_name);
    candidate.to_str().map(|s| s.replace('\\', "/"))
}
221
222// ---------------------------------------------------------------------------
223// Knowledge Bridge
224// ---------------------------------------------------------------------------
225
226pub fn index_knowledge(graph: &CodeGraph, project_root: &str) -> anyhow::Result<EnrichmentStats> {
227    let mut stats = EnrichmentStats::default();
228
229    let knowledge = crate::core::knowledge::ProjectKnowledge::load(project_root);
230    let Some(knowledge) = knowledge else {
231        return Ok(stats);
232    };
233
234    let mut mentioned_files: HashSet<String> = HashSet::new();
235
236    for fact in &knowledge.facts {
237        let node = Node::knowledge(&fact.key, &format!("[{}] {}", fact.category, fact.value));
238        let knowledge_id = graph.upsert_node(&node)?;
239        stats.knowledge_indexed += 1;
240
241        for file_ref in extract_file_refs(&fact.value) {
242            if mentioned_files.insert(format!("{}:{}", fact.key, file_ref)) {
243                if let Some(file_node) = graph.get_node_by_path(&file_ref)? {
244                    if let Some(file_id) = file_node.id {
245                        graph.upsert_edge(&Edge::new(
246                            file_id,
247                            knowledge_id,
248                            EdgeKind::MentionedIn,
249                        ))?;
250                        stats.edges_created += 1;
251                    }
252                }
253            }
254        }
255    }
256
257    Ok(stats)
258}
259
/// Extracts tokens from free-form `text` that look like file paths.
///
/// The text is split on whitespace; each token is stripped of surrounding
/// quoting and prose punctuation and kept when `looks_like_file_path`
/// accepts it. Tokens are returned in order of appearance, duplicates
/// included.
fn extract_file_refs(text: &str) -> Vec<String> {
    text.split_whitespace()
        .map(strip_prose_punctuation)
        .filter(|token| looks_like_file_path(token))
        .map(str::to_string)
        .collect()
}

/// Removes quoting and sentence punctuation that commonly surrounds a path in
/// prose, e.g. `` `src/main.rs`. `` or `(core/utils.ts),`.
///
/// A leading `.` is kept on purpose so `./src/x.rs` and `.github/ci.yml`
/// survive; sentence-ending punctuation is only stripped from the end.
fn strip_prose_punctuation(word: &str) -> &str {
    const QUOTING: &[char] = &['`', '\'', '"', ','];
    word.trim_start_matches(|c: char| QUOTING.contains(&c) || c == '(')
        .trim_end_matches(|c: char| {
            QUOTING.contains(&c) || matches!(c, ')' | '.' | ';' | ':' | '!' | '?')
        })
}

/// Heuristic check whether `s` could be a file path.
///
/// `s` must be 4 to 200 bytes long and have a file extension. It is accepted
/// when it contains a path separator (`/` or `\`), or when its extension is
/// one of a fixed set of source-code extensions (compared ASCII
/// case-insensitively).
fn looks_like_file_path(s: &str) -> bool {
    const SOURCE_EXTENSIONS: [&str; 15] = [
        "rs", "ts", "py", "js", "go", "java", "tsx", "jsx", "rb", "c", "cpp", "h", "cs", "swift",
        "kt",
    ];

    if !(4..=200).contains(&s.len()) {
        return false;
    }
    let Some(ext) = Path::new(s).extension().and_then(|e| e.to_str()) else {
        return false;
    };

    let has_sep = s.contains('/') || s.contains('\\');
    has_sep
        || SOURCE_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known))
}
302
303// ---------------------------------------------------------------------------
304// Full enrichment pipeline
305// ---------------------------------------------------------------------------
306
/// Counters describing what an enrichment pass added to the graph.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct EnrichmentStats {
    /// Commit nodes upserted.
    pub commits_indexed: usize,
    /// Test nodes upserted.
    pub tests_indexed: usize,
    /// Knowledge nodes upserted.
    pub knowledge_indexed: usize,
    /// Edge upserts performed, across all edge kinds.
    pub edges_created: usize,
}

impl EnrichmentStats {
    /// Adds every counter of `other` onto `self`.
    pub fn merge(&mut self, other: &Self) {
        // Exhaustive destructuring: adding a field to the struct without
        // merging it here becomes a compile error.
        let Self {
            commits_indexed,
            tests_indexed,
            knowledge_indexed,
            edges_created,
        } = other;

        self.commits_indexed += commits_indexed;
        self.tests_indexed += tests_indexed;
        self.knowledge_indexed += knowledge_indexed;
        self.edges_created += edges_created;
    }

    /// Renders the counters as a one-line, human-readable summary.
    pub fn format_summary(&self) -> String {
        format!(
            "Graph enriched: {} commits, {} tests, {} knowledge entries, {} edges",
            self.commits_indexed, self.tests_indexed, self.knowledge_indexed, self.edges_created
        )
    }
}
330
331pub fn enrich_graph(
332    graph: &CodeGraph,
333    project_root: &Path,
334    max_commits: usize,
335) -> anyhow::Result<EnrichmentStats> {
336    let mut total = EnrichmentStats::default();
337
338    let git_stats = index_git_history(graph, project_root, max_commits)?;
339    total.merge(&git_stats);
340
341    let test_stats = index_tests(graph, project_root)?;
342    total.merge(&test_stats);
343
344    if let Some(root_str) = project_root.to_str() {
345        let knowledge_stats = index_knowledge(graph, root_str)?;
346        total.merge(&knowledge_stats);
347
348        let callgraph_stats = consolidate_callgraph(graph, root_str)?;
349        total.merge(&callgraph_stats);
350    }
351
352    Ok(total)
353}
354
355fn consolidate_callgraph(graph: &CodeGraph, project_root: &str) -> anyhow::Result<EnrichmentStats> {
356    let mut stats = EnrichmentStats::default();
357
358    let index = crate::core::graph_index::load_or_build(project_root);
359    let call_graph = crate::core::call_graph::CallGraph::load_or_build(project_root, &index);
360
361    let callee_to_file: std::collections::HashMap<&str, &str> = index
362        .symbols
363        .values()
364        .map(|s| (s.name.as_str(), s.file.as_str()))
365        .collect();
366    // TODO(opt1415): Once PropertyGraph stores all symbols, replace
367    // ProjectIndex lookup with: SELECT file_path FROM nodes WHERE kind='symbol' AND name=?
368
369    for edge in &call_graph.edges {
370        let from_file = &edge.caller_file;
371        let to_file = match callee_to_file.get(edge.callee_name.as_str()) {
372            Some(f) => *f,
373            None => continue,
374        };
375
376        if from_file == to_file {
377            continue;
378        }
379
380        let from_node = graph.get_node_by_path(from_file)?;
381        let to_node = graph.get_node_by_path(to_file)?;
382
383        if let (Some(from_n), Some(to_n)) = (from_node, to_node) {
384            if let (Some(from_id), Some(to_id)) = (from_n.id, to_n.id) {
385                graph.upsert_edge(&Edge::new(from_id, to_id, EdgeKind::Calls))?;
386                stats.edges_created += 1;
387            }
388        }
389    }
390
391    Ok(stats)
392}
393
394// ---------------------------------------------------------------------------
395// Tests
396// ---------------------------------------------------------------------------
397
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::property_graph::NodeKind;

    // -- git log parsing ----------------------------------------------------

    #[test]
    fn parse_git_log_basic() {
        let raw = concat!(
            "abc1234567890abcdef1234567890abcdef12345678\n",
            "abc1234\n",
            "John Doe\n",
            "2026-04-28 12:00:00 +0200\n",
            "feat: add feature\n",
            "src/main.rs\n",
            "src/lib.rs\n",
            "\n",
        );

        let parsed = parse_git_log(raw);

        assert_eq!(parsed.len(), 1);
        let only = &parsed[0];
        assert_eq!(only.short_hash, "abc1234");
        assert_eq!(only.author, "John Doe");
        assert_eq!(only.files_changed.len(), 2);
    }

    #[test]
    fn parse_git_log_multiple() {
        let raw = concat!(
            "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2\n",
            "a1b2c3d\n",
            "Alice\n",
            "2026-04-27\n",
            "first\n",
            "file1.rs\n",
            "\n",
            "f6e5d4c3b2a1f6e5d4c3b2a1f6e5d4c3b2a1f6e5\n",
            "f6e5d4c\n",
            "Bob\n",
            "2026-04-28\n",
            "second\n",
            "file2.rs\n",
            "file3.rs\n",
            "\n",
        );

        let parsed = parse_git_log(raw);

        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[1].files_changed.len(), 2);
    }

    // -- test-file heuristics -----------------------------------------------

    #[test]
    fn is_test_file_detection() {
        let test_paths = [
            "src/utils_test.rs",
            "tests/integration.rs",
            "src/component.test.ts",
            "src/component.spec.js",
            "__tests__/app.js",
        ];
        for path in test_paths {
            assert!(is_test_file(path), "{path} should be detected as a test");
        }

        let regular_paths = ["src/main.rs", "src/utils.rs"];
        for path in regular_paths {
            assert!(!is_test_file(path), "{path} should not be detected as a test");
        }
    }

    #[test]
    fn infer_tested_file_from_test() {
        let cases = [
            ("src/utils_test.rs", "src/utils.rs"),
            ("src/component.test.ts", "src/component.ts"),
            ("src/app.spec.js", "src/app.js"),
        ];
        for (test_path, expected) in cases {
            assert_eq!(infer_tested_file(test_path), Some(expected.to_string()));
        }
    }

    #[test]
    fn infer_tested_file_prefix() {
        let inferred = infer_tested_file("tests/test_parser.py");
        assert_eq!(inferred, Some("tests/parser.py".to_string()));
    }

    // -- knowledge file references ------------------------------------------

    #[test]
    fn looks_like_file_path_detection() {
        for accepted in ["src/main.rs", "core/utils.ts", "main.py"] {
            assert!(looks_like_file_path(accepted), "{accepted} should be accepted");
        }
        for rejected in ["hello", "a.b", ".hidden"] {
            assert!(!looks_like_file_path(rejected), "{rejected} should be rejected");
        }
    }

    #[test]
    fn extract_file_refs_from_text() {
        let found = extract_file_refs("Changed `src/main.rs` and core/utils.ts for the fix");
        assert!(found.contains(&"src/main.rs".to_string()));
        assert!(found.contains(&"core/utils.ts".to_string()));
    }

    // -- stats --------------------------------------------------------------

    #[test]
    fn enrichment_stats_merge() {
        let mut acc = EnrichmentStats {
            commits_indexed: 5,
            tests_indexed: 3,
            knowledge_indexed: 2,
            edges_created: 10,
        };
        let delta = EnrichmentStats {
            commits_indexed: 2,
            tests_indexed: 1,
            knowledge_indexed: 0,
            edges_created: 4,
        };

        acc.merge(&delta);

        assert_eq!(acc.commits_indexed, 7);
        assert_eq!(acc.edges_created, 14);
    }

    #[test]
    fn enrichment_stats_format() {
        let summary = EnrichmentStats {
            commits_indexed: 10,
            tests_indexed: 5,
            knowledge_indexed: 3,
            edges_created: 20,
        }
        .format_summary();

        assert!(summary.contains("10 commits"));
        assert!(summary.contains("5 tests"));
    }

    // -- node constructors --------------------------------------------------

    #[test]
    fn commit_node_construction() {
        let commit = Node::commit("abc1234", "feat: add feature");
        assert_eq!(commit.kind, NodeKind::Commit);
        assert_eq!(commit.name, "abc1234");
    }

    #[test]
    fn test_node_construction() {
        let test = Node::test("src/utils_test.rs", "src/utils_test.rs");
        assert_eq!(test.kind, NodeKind::Test);
        assert_eq!(test.file_path, "src/utils_test.rs");
    }

    #[test]
    fn knowledge_node_construction() {
        let fact = Node::knowledge("k1", "Database uses PostgreSQL");
        assert_eq!(fact.kind, NodeKind::Knowledge);
        assert!(fact.metadata.unwrap().contains("PostgreSQL"));
    }

    // -- graph round trips --------------------------------------------------

    #[test]
    fn graph_commit_and_edge() {
        let graph = CodeGraph::open_in_memory().unwrap();
        let file_id = graph.upsert_node(&Node::file("src/main.rs")).unwrap();
        let commit_id = graph
            .upsert_node(&Node::commit("abc1234", "fix bug"))
            .unwrap();
        graph
            .upsert_edge(&Edge::new(file_id, commit_id, EdgeKind::ChangedIn))
            .unwrap();

        let outgoing = graph.edges_from(file_id).unwrap();
        assert_eq!(outgoing.len(), 1);
        assert_eq!(outgoing[0].kind, EdgeKind::ChangedIn);
    }

    #[test]
    fn graph_test_edge() {
        let graph = CodeGraph::open_in_memory().unwrap();
        let code_id = graph.upsert_node(&Node::file("src/utils.rs")).unwrap();
        let test_id = graph
            .upsert_node(&Node::test("src/utils_test.rs", "test_parse"))
            .unwrap();
        graph
            .upsert_edge(&Edge::new(code_id, test_id, EdgeKind::TestedBy))
            .unwrap();

        let outgoing = graph.edges_from(code_id).unwrap();
        assert_eq!(outgoing[0].kind, EdgeKind::TestedBy);
    }

    #[test]
    fn graph_knowledge_edge() {
        let graph = CodeGraph::open_in_memory().unwrap();
        let file_id = graph.upsert_node(&Node::file("src/db.rs")).unwrap();
        let knowledge_id = graph
            .upsert_node(&Node::knowledge("db_type", "Uses PostgreSQL"))
            .unwrap();
        graph
            .upsert_edge(&Edge::new(file_id, knowledge_id, EdgeKind::MentionedIn))
            .unwrap();

        let outgoing = graph.edges_from(file_id).unwrap();
        assert_eq!(outgoing[0].kind, EdgeKind::MentionedIn);
    }
}