use anyhow::Result;
use std::collections::HashSet;
use std::path::Path;
use tracing::{debug, info};
use super::blame::GitBlame;
use super::history::GitHistory;
use crate::graph::builder::GraphBuilder;
use crate::graph::store_models::ExtraProps;
use crate::graph::{CodeEdge, CodeNode, EdgeKind, NodeKind};
use crate::models::LineRange;
/// Counters describing what a git enrichment run did.
///
/// Every counter starts at zero via `Default`.
#[derive(Debug, Clone, Default)]
pub struct EnrichmentStats {
    /// Function nodes that had git properties written onto them.
    pub functions_enriched: usize,
    /// Class nodes that had git properties written onto them.
    pub classes_enriched: usize,
    /// Commit nodes created. NOTE(review): no code path visible in this file
    /// increments this counter — confirm whether it is still meant to be used.
    pub commits_created: usize,
    /// Edges created. NOTE(review): no code path visible in this file
    /// increments this counter — confirm whether it is still meant to be used.
    pub edges_created: usize,
    /// Blame lookups that returned an error. The per-kind passes bump this once
    /// per function/class whose lookup failed, not once per distinct file.
    pub files_skipped: usize,
    /// First value reported by the blame cache pre-warm step (logged as "hits").
    pub cache_hits: usize,
    /// Second value reported by the blame cache pre-warm step (logged as "computed").
    pub cache_misses: usize,
}
/// Writes git blame metadata onto the Function and Class nodes of a graph.
pub struct GitEnricher<'a> {
    /// Blame engine opened on the repository root reported by `history`.
    blame: GitBlame,
    /// History handle the enricher was built from. It is read in `new` to find
    /// the repository root; the stored field itself is not read in this file.
    #[allow(dead_code)]
    history: &'a GitHistory,
    /// Graph whose nodes are updated in place.
    graph: &'a mut GraphBuilder,
    /// Commit hashes that already have a Commit node; consulted only by
    /// `create_commit_if_needed`, which is itself currently unused.
    #[allow(dead_code)]
    seen_commits: HashSet<String>,
}
impl<'a> GitEnricher<'a> {
/// Builds an enricher that blames files in the repository behind `history`
/// and writes the results into `graph`.
///
/// # Errors
/// Fails when `history` cannot report a repository root, or when
/// `GitBlame::open` fails on that root.
pub fn new(history: &'a GitHistory, graph: &'a mut GraphBuilder) -> Result<Self> {
    let blame = GitBlame::open(history.repo_root()?)?;
    let seen_commits = HashSet::new();
    Ok(Self {
        blame,
        history,
        graph,
        seen_commits,
    })
}
/// Enriches every Function and Class node that still lacks git metadata.
///
/// Steps, in order:
/// 1. Collect the file paths of all functions/classes without a
///    `last_modified` extra prop.
/// 2. Drop paths matched by `is_blame_skip_path` and pre-warm the blame
///    cache for the remaining files.
/// 3. Run the function pass and the class pass, merging their counters.
/// 4. Save the blame cache; a save failure is logged at debug level and
///    otherwise ignored.
///
/// # Errors
/// Propagates any error returned by the function or class pass.
pub fn enrich_all(&mut self) -> Result<EnrichmentStats> {
    let mut stats = EnrichmentStats::default();
    let gi = self.graph.interner();
    let functions = self.graph.get_functions();
    let classes = self.graph.get_classes();

    // Files containing at least one entity that has not been enriched yet.
    let mut unique_files: HashSet<String> = HashSet::new();
    for f in &functions {
        let has_last_modified = self
            .graph
            .get_extra_props(f.qualified_name)
            .and_then(|ep| ep.last_modified)
            .is_some();
        if !has_last_modified {
            unique_files.insert(f.path(gi).to_string());
        }
    }
    for c in &classes {
        let has_last_modified = self
            .graph
            .get_extra_props(c.qualified_name)
            .and_then(|ep| ep.last_modified)
            .is_some();
        if !has_last_modified {
            unique_files.insert(c.path(gi).to_string());
        }
    }

    let file_list: Vec<String> = unique_files
        .into_iter()
        .filter(|p| !is_blame_skip_path(p))
        .collect();

    // The cache counters keep their zero defaults when there is nothing to warm.
    if !file_list.is_empty() {
        info!(
            "Pre-warming git blame cache for {} files...",
            file_list.len()
        );
        let (hits, misses) = self.blame.prewarm_cache(&file_list);
        debug!("Git cache: {} hits, {} computed", hits, misses);
        stats.cache_hits = hits;
        stats.cache_misses = misses;
    }

    info!("Enriching Function nodes with git history...");
    let func_stats = self.enrich_functions()?;
    stats.functions_enriched = func_stats.functions_enriched;
    stats.commits_created += func_stats.commits_created;
    stats.edges_created += func_stats.edges_created;
    // Previously dropped: failed blame lookups never reached the caller.
    stats.files_skipped += func_stats.files_skipped;

    info!("Enriching Class nodes with git history...");
    let class_stats = self.enrich_classes()?;
    stats.classes_enriched = class_stats.classes_enriched;
    stats.commits_created += class_stats.commits_created;
    stats.edges_created += class_stats.edges_created;
    stats.files_skipped += class_stats.files_skipped;

    info!(
        "Git enrichment complete: {} functions, {} classes, {} commits, {} edges",
        stats.functions_enriched,
        stats.classes_enriched,
        stats.commits_created,
        stats.edges_created
    );

    // Persisting the cache is best-effort; enrichment results are already in the graph.
    if let Err(e) = self.blame.save_cache() {
        debug!("Git blame cache save failed (ignored): {e}");
    }
    Ok(stats)
}
/// Blame pass over Function nodes.
///
/// Only functions without a `last_modified` extra prop are processed. For
/// each one the blame of its line range is looked up and, when both a
/// last-modified value and an author are available, `last_modified`,
/// `author` and `commit_count` are written onto the node.
///
/// Returns stats in which only `functions_enriched` and `files_skipped`
/// are ever touched by this pass.
fn enrich_functions(&mut self) -> Result<EnrichmentStats> {
    let gi = self.graph.interner();
    let mut stats = EnrichmentStats::default();

    let pending: Vec<_> = self
        .graph
        .get_functions()
        .into_iter()
        .filter(|node| {
            self.graph
                .get_extra_props(node.qualified_name)
                .and_then(|props| props.last_modified)
                .is_none()
        })
        .collect();
    let total = pending.len();
    debug!("Found {} functions to enrich", total);

    for (idx, func) in pending.into_iter().enumerate() {
        // Progress heartbeat every 500 functions.
        if idx != 0 && idx % 500 == 0 {
            debug!("Enriched {}/{} functions", idx, total);
        }

        let first_line = func.line_start;
        let last_line = func.line_end;
        // A start line of 0 is skipped without attempting a blame lookup.
        if first_line == 0 {
            continue;
        }

        let blame_info = match self
            .blame
            .get_entity_blame(func.path(gi), LineRange::new(first_line, last_line))
        {
            Ok(info) => info,
            Err(err) => {
                debug!(
                    "Failed to get blame for {}:{}: {}",
                    func.path(gi),
                    first_line,
                    err
                );
                stats.files_skipped += 1;
                continue;
            }
        };

        // Both values are required; a partial blame result leaves the node untouched.
        let (Some(modified_at), Some(author_name)) =
            (&blame_info.last_modified, &blame_info.last_author)
        else {
            continue;
        };

        let props = [
            (
                "last_modified",
                serde_json::Value::String(modified_at.clone()),
            ),
            ("author", serde_json::Value::String(author_name.clone())),
            (
                "commit_count",
                serde_json::Value::Number(serde_json::Number::from(
                    blame_info.commit_count as i64,
                )),
            ),
        ];
        self.graph.update_node_properties(func.qn(gi), &props);
        stats.functions_enriched += 1;
    }

    Ok(stats)
}
/// Blame pass over Class nodes.
///
/// Mirrors the function pass: classes without a `last_modified` extra prop
/// get their line range blamed, and `last_modified`, `author` and
/// `commit_count` are written when both a last-modified value and an
/// author are available.
///
/// Returns stats in which only `classes_enriched` and `files_skipped` are
/// ever touched by this pass.
fn enrich_classes(&mut self) -> Result<EnrichmentStats> {
    let gi = self.graph.interner();
    let mut stats = EnrichmentStats::default();

    let pending: Vec<_> = self
        .graph
        .get_classes()
        .into_iter()
        .filter(|node| {
            self.graph
                .get_extra_props(node.qualified_name)
                .and_then(|props| props.last_modified)
                .is_none()
        })
        .collect();
    let total = pending.len();
    debug!("Found {} classes to enrich", total);

    for (idx, class) in pending.into_iter().enumerate() {
        // Progress heartbeat every 50 classes.
        if idx != 0 && idx % 50 == 0 {
            debug!("Enriched {}/{} classes", idx, total);
        }

        let first_line = class.line_start;
        let last_line = class.line_end;
        // A start line of 0 is skipped without attempting a blame lookup.
        if first_line == 0 {
            continue;
        }

        let blame_info = match self
            .blame
            .get_entity_blame(class.path(gi), LineRange::new(first_line, last_line))
        {
            Ok(info) => info,
            Err(err) => {
                debug!(
                    "Failed to get blame for {}:{}: {}",
                    class.path(gi),
                    first_line,
                    err
                );
                stats.files_skipped += 1;
                continue;
            }
        };

        // Both values are required; a partial blame result leaves the node untouched.
        let Some((modified_at, author_name)) = blame_info
            .last_modified
            .as_ref()
            .zip(blame_info.last_author.as_ref())
        else {
            continue;
        };

        let props = [
            (
                "last_modified",
                serde_json::Value::String(modified_at.clone()),
            ),
            ("author", serde_json::Value::String(author_name.clone())),
            (
                "commit_count",
                serde_json::Value::Number(serde_json::Number::from(
                    blame_info.commit_count as i64,
                )),
            ),
        ];
        self.graph.update_node_properties(class.qn(gi), &props);
        stats.classes_enriched += 1;
    }

    Ok(stats)
}
/// Adds a Commit node for `hash` unless one was already created by this enricher.
///
/// The node uses the interned hash as both name and qualified name, with
/// every other field set to the empty key or zero. `author` and `timestamp`
/// are stored as the node's `author` and `last_modified` extra props.
///
/// Returns `true` when a node was created, `false` when `hash` had been
/// seen before (in which case nothing is modified).
#[allow(dead_code)]
fn create_commit_if_needed(&mut self, hash: &str, author: &str, timestamp: &str) -> bool {
    let already_recorded = self.seen_commits.contains(hash);
    if already_recorded {
        return false;
    }

    let interner = self.graph.interner();
    let blank = interner.empty_key();
    let commit_key = interner.intern(hash);

    self.graph.add_node(CodeNode {
        kind: NodeKind::Commit,
        name: commit_key,
        qualified_name: commit_key,
        file_path: blank,
        language: blank,
        line_start: 0,
        line_end: 0,
        complexity: 0,
        param_count: 0,
        method_count: 0,
        field_count: 0,
        max_nesting: 0,
        return_count: 0,
        commit_count: 0,
        flags: 0,
    });

    // Interned after the node is added, author first, matching the original order.
    let author_key = interner.intern(author);
    let timestamp_key = interner.intern(timestamp);
    let commit_props = ExtraProps {
        author: Some(author_key),
        last_modified: Some(timestamp_key),
        ..Default::default()
    };
    self.graph.set_extra_props(commit_key, commit_props);

    self.seen_commits.insert(hash.to_string());
    true
}
/// Adds a `ModifiedIn` edge from the entity named `entity_qn` to the commit
/// named `commit_hash`.
///
/// Returns the boolean reported by `GraphBuilder::add_edge_by_name`
/// (presumably whether the edge was added — confirm against that method).
#[allow(dead_code)]
fn create_modified_in_edge(&mut self, entity_qn: &str, commit_hash: &str) -> bool {
    let edge = CodeEdge::new(EdgeKind::ModifiedIn);
    self.graph.add_edge_by_name(entity_qn, commit_hash, edge)
}
}
/// Convenience entry point: opens the git history at `repo_path` and runs a
/// full enrichment pass over `graph`.
///
/// `_repo_id` is accepted but not used.
///
/// # Errors
/// Fails when the history cannot be opened, when the enricher cannot be
/// constructed, or when the enrichment pass itself reports an error.
pub fn enrich_graph_with_git(
    repo_path: &Path,
    graph: &mut GraphBuilder,
    _repo_id: Option<&str>,
) -> Result<EnrichmentStats> {
    let history = GitHistory::new(repo_path)?;
    let mut enricher = GitEnricher::new(&history, graph)?;
    let stats = enricher.enrich_all()?;
    Ok(stats)
}
/// Reports whether `path` should be left out of the blame cache pre-warm.
///
/// A path is skipped when, after converting `\` to `/`, it either passes
/// through one of the `SKIP_SEGMENTS` directories (at the root or nested)
/// or ends with one of the `SKIP_SUFFIXES`.
fn is_blame_skip_path(path: &str) -> bool {
    const SKIP_SEGMENTS: &[&str] = &[
        "/vendor/",
        "/node_modules/",
        "/dist/",
        "/build/",
        "/target/",
        "/.venv/",
        "/__pycache__/",
        "/third_party/",
        "/third-party/",
    ];
    const SKIP_SUFFIXES: &[&str] = &[
        ".lock",
        ".min.js",
        ".min.css",
        ".map",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Cargo.lock",
        "poetry.lock",
        "composer.lock",
        "Gemfile.lock",
    ];

    // Build "/<path with forward slashes>" in one pass. The leading '/' lets a
    // root-level directory such as "vendor/..." match the "/vendor/" pattern.
    let mut anchored = String::with_capacity(path.len() + 1);
    anchored.push('/');
    anchored.extend(path.chars().map(|ch| if ch == '\\' { '/' } else { ch }));

    for segment in SKIP_SEGMENTS {
        if anchored.contains(segment) {
            return true;
        }
    }

    // Suffixes are tested against the path without the synthetic leading '/'.
    let unanchored = &anchored[1..];
    for suffix in SKIP_SUFFIXES {
        if unanchored.ends_with(suffix) {
            return true;
        }
    }
    false
}
/// Unit tests for `is_blame_skip_path`.
#[cfg(test)]
mod skip_path_tests {
    use super::is_blame_skip_path;

    /// Dependency directories are skipped at the repo root and when nested.
    #[test]
    fn skips_vendor_and_node_modules() {
        assert!(is_blame_skip_path("vendor/foo/bar.py"));
        assert!(is_blame_skip_path("src/vendor/x.rs"));
        assert!(is_blame_skip_path("app/node_modules/react/index.js"));
    }

    /// Lockfiles and minified bundles are skipped by suffix.
    #[test]
    fn skips_lockfiles_and_minified() {
        assert!(is_blame_skip_path("yarn.lock"));
        assert!(is_blame_skip_path("src/bundle.min.js"));
        assert!(is_blame_skip_path("Cargo.lock"));
    }

    /// Build-output directories and generated assets are skipped too.
    #[test]
    fn skips_build_output_and_generated_assets() {
        assert!(is_blame_skip_path("target/debug/build.rs"));
        assert!(is_blame_skip_path("dist/app.js"));
        assert!(is_blame_skip_path("assets/app.js.map"));
        assert!(is_blame_skip_path("styles/site.min.css"));
        assert!(is_blame_skip_path("web/package-lock.json"));
    }

    /// Windows-style separators are normalized before matching.
    #[test]
    fn normalizes_backslash_separators() {
        assert!(is_blame_skip_path("src\\vendor\\x.rs"));
        assert!(is_blame_skip_path("node_modules\\react\\index.js"));
        assert!(!is_blame_skip_path("src\\main.rs"));
    }

    /// Ordinary source files are never skipped.
    #[test]
    fn keeps_normal_sources() {
        assert!(!is_blame_skip_path("src/main.rs"));
        assert!(!is_blame_skip_path("app/foo.py"));
        assert!(!is_blame_skip_path("pkg/auth/session.go"));
    }

    /// Directory patterns only match whole path components.
    #[test]
    fn does_not_false_positive_on_substrings() {
        assert!(!is_blame_skip_path("src/vendors/v.rs"));
        assert!(!is_blame_skip_path("src/builder/mod.rs"));
        assert!(!is_blame_skip_path("src/build.rs"));
        assert!(!is_blame_skip_path("src/targets/x.rs"));
    }
}