repotoire 0.8.0

Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
//! Git enrichment for the code graph
//!
//! Enriches Function and Class nodes with git history data:
//! - last_modified timestamp
//! - author of last modification
//! - commit_count
//! - Commit nodes and MODIFIED_IN relationships: helper methods for these
//!   exist, but the current enrichment passes do not call them

use anyhow::Result;
use std::collections::HashSet;
use std::path::Path;
use tracing::{debug, info};

use super::blame::GitBlame;
use super::history::GitHistory;
use crate::graph::builder::GraphBuilder;
use crate::graph::store_models::ExtraProps;
use crate::graph::{CodeEdge, CodeNode, EdgeKind, NodeKind};
use crate::models::LineRange;

/// Statistics from git enrichment.
///
/// All counters start at zero (`Default`). One value of this type is returned
/// by each enrichment pass and by [`GitEnricher::enrich_all`].
#[derive(Debug, Clone, Default)]
pub struct EnrichmentStats {
    /// Number of functions enriched with git data, i.e. functions that had
    /// `last_modified`, `author` and `commit_count` written to their node.
    pub functions_enriched: usize,
    /// Number of classes enriched with git data, i.e. classes that had
    /// `last_modified`, `author` and `commit_count` written to their node.
    pub classes_enriched: usize,
    /// Number of Commit nodes created.
    /// NOTE(review): the enrichment passes visible in this file never create
    /// Commit nodes, so this appears to stay 0 today — confirm.
    pub commits_created: usize,
    /// Number of MODIFIED_IN edges created.
    /// NOTE(review): the enrichment passes visible in this file never create
    /// edges, so this appears to stay 0 today — confirm.
    pub edges_created: usize,
    /// Files skipped (not in git).
    /// The per-entity passes bump this once for every function or class whose
    /// blame lookup returns an error, so it counts entities rather than
    /// distinct files.
    pub files_skipped: usize,
    /// Files loaded from disk cache (the "hits" figure reported by the blame
    /// cache pre-warm).
    pub cache_hits: usize,
    /// Files computed fresh (the "misses" figure reported by the blame cache
    /// pre-warm).
    pub cache_misses: usize,
}

/// Git enricher for the code graph.
///
/// Borrows a [`GitHistory`] and a mutable [`GraphBuilder`] for the lifetime
/// `'a`, and owns the [`GitBlame`] handle that the enrichment passes query for
/// per-entity blame information.
pub struct GitEnricher<'a> {
    /// Blame handle used for cache pre-warming and per-entity lookups.
    blame: GitBlame,
    /// History handle supplied by the caller; only read while constructing
    /// the enricher.
    #[allow(dead_code)] // Stored for future commit history analysis
    history: &'a GitHistory,
    /// Graph whose Function and Class nodes receive the git properties.
    graph: &'a mut GraphBuilder,
    /// Track commits we've already created
    #[allow(dead_code)] // Used by create_commit_if_needed
    seen_commits: HashSet<String>,
}

impl<'a> GitEnricher<'a> {
    /// Build an enricher over `graph`, opening blame access at the repository
    /// root reported by `history`.
    ///
    /// The set of already-created commits starts out empty.
    ///
    /// # Errors
    ///
    /// Propagates the error if the repository root cannot be obtained from
    /// `history` or if [`GitBlame::open`] fails.
    pub fn new(history: &'a GitHistory, graph: &'a mut GraphBuilder) -> Result<Self> {
        let blame = GitBlame::open(history.repo_root()?)?;
        let seen_commits = HashSet::new();
        Ok(Self {
            blame,
            history,
            graph,
            seen_commits,
        })
    }

    /// Enrich all Function and Class nodes with git data.
    ///
    /// Steps, in order:
    /// 1. Collect the file paths of every function and class whose extra
    ///    props carry no `last_modified` value, drop the paths matched by
    ///    `is_blame_skip_path`, and pre-warm the blame cache for the rest.
    /// 2. Run the function pass, then the class pass.
    /// 3. Save the blame cache; a failure here is logged and ignored.
    ///
    /// The returned [`EnrichmentStats`] combines the counters of both passes
    /// (including `files_skipped`) with the cache hit/miss figures from the
    /// pre-warm step.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the function or class pass.
    pub fn enrich_all(&mut self) -> Result<EnrichmentStats> {
        let mut stats = EnrichmentStats::default();
        let gi = self.graph.interner();

        // Collect all unique files from functions and classes that still lack
        // git data.
        let functions = self.graph.get_functions();
        let classes = self.graph.get_classes();

        let mut unique_files: HashSet<String> = HashSet::new();
        for f in &functions {
            let has_last_modified = self
                .graph
                .get_extra_props(f.qualified_name)
                .and_then(|ep| ep.last_modified)
                .is_some();
            if !has_last_modified {
                unique_files.insert(f.path(gi).to_string());
            }
        }
        for c in &classes {
            let has_last_modified = self
                .graph
                .get_extra_props(c.qualified_name)
                .and_then(|ep| ep.last_modified)
                .is_some();
            if !has_last_modified {
                unique_files.insert(c.path(gi).to_string());
            }
        }

        // Pre-warm blame cache in parallel (uses disk cache for unchanged files).
        // Filter out paths that we never want blame metadata for — vendored
        // code, dependency lockfiles, build artifacts, minified bundles. These
        // routinely end up in the graph (an AST parser will still produce
        // entities for a vendored Python file or a lockfile), but blaming them
        // burns commit walks on noise that no detector will use downstream.
        let file_list: Vec<String> = unique_files
            .into_iter()
            .filter(|p| !is_blame_skip_path(p))
            .collect();
        if !file_list.is_empty() {
            info!(
                "Pre-warming git blame cache for {} files...",
                file_list.len()
            );
            let (hits, misses) = self.blame.prewarm_cache(&file_list);
            debug!("Git cache: {} hits, {} computed", hits, misses);
            stats.cache_hits = hits;
            stats.cache_misses = misses;
        }

        // Enrich Functions (cache lookups for the files pre-warmed above)
        info!("Enriching Function nodes with git history...");
        let func_stats = self.enrich_functions()?;
        stats.functions_enriched = func_stats.functions_enriched;
        stats.commits_created += func_stats.commits_created;
        stats.edges_created += func_stats.edges_created;
        // The pass counts failed blame lookups; carry that count into the
        // aggregate so callers do not always see 0.
        stats.files_skipped += func_stats.files_skipped;

        // Enrich Classes (cache lookups for the files pre-warmed above)
        info!("Enriching Class nodes with git history...");
        let class_stats = self.enrich_classes()?;
        stats.classes_enriched = class_stats.classes_enriched;
        stats.commits_created += class_stats.commits_created;
        stats.edges_created += class_stats.edges_created;
        stats.files_skipped += class_stats.files_skipped;

        info!(
            "Git enrichment complete: {} functions, {} classes, {} commits, {} edges",
            stats.functions_enriched,
            stats.classes_enriched,
            stats.commits_created,
            stats.edges_created
        );

        // Persist any blame entries computed during per-entity enrichment that
        // arrived after prewarm_cache saved. Fire-and-forget — a cache-write
        // failure must not fail the analysis.
        if let Err(e) = self.blame.save_cache() {
            debug!("Git blame cache save failed (ignored): {e}");
        }

        Ok(stats)
    }

    /// Attach blame-derived properties to every Function node that lacks them.
    ///
    /// A function is processed when its extra props carry no `last_modified`
    /// value. Blame is looked up for the function's line span and, when both a
    /// timestamp and an author come back, `last_modified`, `author` and
    /// `commit_count` are written to the node and `functions_enriched` is
    /// incremented.
    ///
    /// Functions are passed over without counting when `line_start` is 0 or
    /// when blame yields no timestamp or no author. A failed blame lookup is
    /// logged at debug level and counted in `files_skipped`.
    ///
    /// NOTE(review): unlike the pre-warm step in `enrich_all`, this pass does
    /// not consult `is_blame_skip_path` — confirm whether vendored/generated
    /// paths are meant to be blamed here.
    fn enrich_functions(&mut self) -> Result<EnrichmentStats> {
        let mut stats = EnrichmentStats::default();
        let gi = self.graph.interner();

        // Keep only the functions that have not been enriched yet.
        let pending: Vec<_> = self
            .graph
            .get_functions()
            .into_iter()
            .filter(|candidate| {
                let enriched_at = self
                    .graph
                    .get_extra_props(candidate.qualified_name)
                    .and_then(|props| props.last_modified);
                enriched_at.is_none()
            })
            .collect();

        let total = pending.len();
        debug!("Found {} functions to enrich", total);

        for (done, func) in pending.into_iter().enumerate() {
            // Progress heartbeat every 500 functions.
            if done != 0 && done % 500 == 0 {
                debug!("Enriched {}/{} functions", done, total);
            }

            // A start line of 0 is skipped before any blame lookup is made.
            if func.line_start == 0 {
                continue;
            }
            let span = LineRange::new(func.line_start, func.line_end);

            let blame_info = match self.blame.get_entity_blame(func.path(gi), span) {
                Ok(info) => info,
                Err(e) => {
                    debug!(
                        "Failed to get blame for {}:{}: {}",
                        func.path(gi),
                        func.line_start,
                        e
                    );
                    stats.files_skipped += 1;
                    continue;
                }
            };

            // Both pieces are required; otherwise the node is left untouched.
            let (Some(last_modified), Some(author)) =
                (&blame_info.last_modified, &blame_info.last_author)
            else {
                continue;
            };

            let commit_count =
                serde_json::Value::Number((blame_info.commit_count as i64).into());
            let props = [
                (
                    "last_modified",
                    serde_json::Value::String(last_modified.clone()),
                ),
                ("author", serde_json::Value::String(author.clone())),
                ("commit_count", commit_count),
            ];
            self.graph.update_node_properties(func.qn(gi), &props);
            stats.functions_enriched += 1;
        }

        Ok(stats)
    }

    /// Attach blame-derived properties to every Class node that lacks them.
    ///
    /// A class is processed when its extra props carry no `last_modified`
    /// value. Blame is looked up for the class's line span and, when both a
    /// timestamp and an author come back, `last_modified`, `author` and
    /// `commit_count` are written to the node and `classes_enriched` is
    /// incremented. No Commit nodes or edges are created here.
    ///
    /// Classes are passed over without counting when `line_start` is 0 or when
    /// blame yields no timestamp or no author. A failed blame lookup is logged
    /// at debug level and counted in `files_skipped`.
    ///
    /// NOTE(review): unlike the pre-warm step in `enrich_all`, this pass does
    /// not consult `is_blame_skip_path` — confirm whether vendored/generated
    /// paths are meant to be blamed here.
    fn enrich_classes(&mut self) -> Result<EnrichmentStats> {
        let mut stats = EnrichmentStats::default();
        let gi = self.graph.interner();

        // Keep only the classes that have not been enriched yet.
        let pending: Vec<_> = self
            .graph
            .get_classes()
            .into_iter()
            .filter(|candidate| {
                let enriched_at = self
                    .graph
                    .get_extra_props(candidate.qualified_name)
                    .and_then(|props| props.last_modified);
                enriched_at.is_none()
            })
            .collect();

        let total = pending.len();
        debug!("Found {} classes to enrich", total);

        for (done, class) in pending.into_iter().enumerate() {
            // Progress heartbeat every 50 classes.
            if done != 0 && done % 50 == 0 {
                debug!("Enriched {}/{} classes", done, total);
            }

            // A start line of 0 is skipped before any blame lookup is made.
            if class.line_start == 0 {
                continue;
            }
            let span = LineRange::new(class.line_start, class.line_end);

            let blame_info = match self.blame.get_entity_blame(class.path(gi), span) {
                Ok(info) => info,
                Err(e) => {
                    debug!(
                        "Failed to get blame for {}:{}: {}",
                        class.path(gi),
                        class.line_start,
                        e
                    );
                    stats.files_skipped += 1;
                    continue;
                }
            };

            // Both pieces are required; otherwise the node is left untouched.
            let Some((last_modified, author)) = blame_info
                .last_modified
                .as_ref()
                .zip(blame_info.last_author.as_ref())
            else {
                continue;
            };

            let commit_count =
                serde_json::Value::Number((blame_info.commit_count as i64).into());
            let props = [
                (
                    "last_modified",
                    serde_json::Value::String(last_modified.clone()),
                ),
                ("author", serde_json::Value::String(author.clone())),
                ("commit_count", commit_count),
            ];
            self.graph.update_node_properties(class.qn(gi), &props);
            stats.classes_enriched += 1;
        }

        Ok(stats)
    }

    /// Register a Commit node for `hash` unless one was already created.
    ///
    /// Adds a `NodeKind::Commit` node whose name and qualified name are both
    /// the interned `hash`, stores `author` and `timestamp` in the extra-props
    /// side table under that key, and remembers the hash in `seen_commits`.
    ///
    /// Returns `true` when a node was added and `false` when `hash` had been
    /// seen before (in which case nothing is modified).
    #[allow(dead_code)] // Infrastructure for git graph enrichment
    fn create_commit_if_needed(&mut self, hash: &str, author: &str, timestamp: &str) -> bool {
        if self.seen_commits.contains(hash) {
            return false;
        }

        let interner = self.graph.interner();
        let no_value = interner.empty_key();
        let commit_key = interner.intern(hash);

        // A commit has no source location or code metrics: path and language
        // use the empty key and every numeric field is zero.
        self.graph.add_node(CodeNode {
            kind: NodeKind::Commit,
            name: commit_key,
            qualified_name: commit_key,
            file_path: no_value,
            language: no_value,
            line_start: 0,
            line_end: 0,
            complexity: 0,
            param_count: 0,
            method_count: 0,
            field_count: 0,
            max_nesting: 0,
            return_count: 0,
            commit_count: 0,
            flags: 0,
        });

        // Author and timestamp live in the extra-props side table.
        let commit_props = ExtraProps {
            author: Some(interner.intern(author)),
            last_modified: Some(interner.intern(timestamp)),
            ..Default::default()
        };
        self.graph.set_extra_props(commit_key, commit_props);

        self.seen_commits.insert(hash.to_string());
        true
    }

    /// Link an entity to a commit with an `EdgeKind::ModifiedIn` edge.
    ///
    /// Both endpoints are looked up by name (`entity_qn` as the source,
    /// `commit_hash` as the target). Returns the `bool` reported by the
    /// graph's `add_edge_by_name`.
    #[allow(dead_code)] // Infrastructure for git graph enrichment
    fn create_modified_in_edge(&mut self, entity_qn: &str, commit_hash: &str) -> bool {
        let edge = CodeEdge::new(EdgeKind::ModifiedIn);
        self.graph.add_edge_by_name(entity_qn, commit_hash, edge)
    }
}

/// One-call entry point: open the repository at `repo_path` and enrich `graph`.
///
/// Builds a [`GitHistory`] for `repo_path`, wraps it and `graph` in a
/// [`GitEnricher`], and runs [`GitEnricher::enrich_all`]. `_repo_id` is
/// accepted but not used.
///
/// # Errors
///
/// Propagates any error from opening the history, constructing the enricher,
/// or running the enrichment.
pub fn enrich_graph_with_git(
    repo_path: &Path,
    graph: &mut GraphBuilder,
    _repo_id: Option<&str>,
) -> Result<EnrichmentStats> {
    let history = GitHistory::new(repo_path)?;
    let stats = GitEnricher::new(&history, graph)?.enrich_all()?;
    Ok(stats)
}

/// Decide whether `path` should be excluded from git blame.
///
/// Vendored and generated code ends up in the graph because the AST parsers
/// produce entities for it, but its history is not a signal worth a blame
/// walk. Lockfiles and minified bundles are excluded for the same reason.
///
/// Returns `true` when either:
/// - any directory component of `path` (every component except the last) is
///   one of the vendor/build directory names below, or
/// - `path` ends with one of the lockfile/bundle suffixes below.
///
/// Both `/` and `\` are treated as separators. Matching is case-sensitive and
/// works on whole components, so `vendors/` does not match `vendor`.
fn is_blame_skip_path(path: &str) -> bool {
    // Directory names that mark vendored, generated, or build-output trees.
    const SKIP_DIRS: &[&str] = &[
        "vendor",
        "node_modules",
        "dist",
        "build",
        "target",
        ".venv",
        "__pycache__",
        "third_party",
        "third-party",
    ];
    // File-name endings for lockfiles, minified bundles, and source maps.
    const SKIP_SUFFIXES: &[&str] = &[
        ".lock",
        ".min.js",
        ".min.css",
        ".map",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Cargo.lock",
        "poetry.lock",
        "composer.lock",
        "Gemfile.lock",
    ];

    // Walk the components from the end; the first one yielded is the final
    // component (the file name), which never counts as a directory.
    let in_skipped_dir = path
        .rsplit(['/', '\\'])
        .skip(1)
        .any(|dir| SKIP_DIRS.contains(&dir));
    if in_skipped_dir {
        return true;
    }

    // No suffix contains a separator, so the raw path can be tested directly.
    SKIP_SUFFIXES.iter().any(|suffix| path.ends_with(suffix))
}

#[cfg(test)]
mod skip_path_tests {
    use super::is_blame_skip_path;

    /// Assert that every path in `paths` is excluded from blame.
    fn assert_skipped(paths: &[&str]) {
        for path in paths {
            assert!(is_blame_skip_path(path), "expected {path:?} to be skipped");
        }
    }

    /// Assert that every path in `paths` is kept for blame.
    fn assert_kept(paths: &[&str]) {
        for path in paths {
            assert!(!is_blame_skip_path(path), "expected {path:?} to be kept");
        }
    }

    #[test]
    fn skips_vendor_and_node_modules() {
        assert_skipped(&[
            "vendor/foo/bar.py",
            "src/vendor/x.rs",
            "app/node_modules/react/index.js",
        ]);
    }

    #[test]
    fn skips_lockfiles_and_minified() {
        assert_skipped(&["yarn.lock", "src/bundle.min.js", "Cargo.lock"]);
    }

    #[test]
    fn keeps_normal_sources() {
        assert_kept(&["src/main.rs", "app/foo.py", "pkg/auth/session.go"]);
    }

    #[test]
    fn does_not_false_positive_on_substrings() {
        // "vendors" (plural) is a different directory name from "vendor".
        assert_kept(&["src/vendors/v.rs"]);
    }
}