repotoire 0.8.2

Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
//! Stage 6: Detector execution.

use crate::calibrate::{NgramModel, StyleProfile};
use crate::config::ProjectConfig;
use crate::detectors::analysis_context::FileChurnInfo;
use crate::detectors::base::DetectorScope;
use crate::detectors::{
    apply_hmm_context_filter, build_threshold_resolver, create_all_detectors,
    create_default_detectors, filter_test_file_findings, inject_taint_precomputed,
    precompute_gd_startup, run_detectors, sort_findings_deterministic, DetectorInit,
    PrecomputedAnalysis,
};
use crate::engine::ProgressFn;
use crate::graph::GraphQuery;
use crate::models::Finding;
use crate::values::store::ValueStore;
use anyhow::Result;
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::{Duration, Instant};

/// Input for the detect stage.
///
/// Bundles everything `detect_stage` reads: the code graph, the file list,
/// configuration, optional calibration and git-derived data, and the optional
/// hints that enable the incremental fast path.
pub struct DetectInput<'a> {
    /// Code graph handed to the precompute step and to every analysis context.
    pub graph: &'a dyn GraphQuery,
    /// Source files passed to the full precompute step.
    pub source_files: &'a [PathBuf],
    /// Repository root. Used to locate the `.repotoire` cache directory and to
    /// turn changed/removed paths into repo-relative paths on incremental runs.
    pub repo_path: &'a Path,
    /// Project configuration forwarded to detector construction; its
    /// `dual_branch` setting is also attached to each analysis context.
    pub project_config: &'a ProjectConfig,
    /// Optional style profile used to build the threshold resolver.
    pub style_profile: Option<&'a StyleProfile>,
    /// Optional n-gram model forwarded to detector construction.
    pub ngram_model: Option<&'a NgramModel>,
    /// Optional shared value store; cloned and handed to the full precompute step.
    pub value_store: Option<&'a Arc<ValueStore>>,
    /// Names of detectors to exclude from this run.
    pub skip_detectors: &'a [String],
    /// Worker count passed to `run_detectors`.
    pub workers: usize,
    /// Optional progress callback.
    /// NOTE(review): not referenced by the detect functions visible in this
    /// module — confirm whether it is consumed elsewhere.
    pub progress: Option<ProgressFn>,

    /// Per-file git churn data from git enrichment stage.
    pub file_churn: Arc<HashMap<String, FileChurnInfo>>,

    /// Full co-change matrix for pairwise file coupling queries.
    pub co_change_matrix: Option<Arc<crate::git::co_change::CoChangeMatrix>>,

    /// DOA-based file ownership model for bus factor analysis.
    pub ownership: Option<Arc<crate::git::ownership::OwnershipModel>>,

    /// L3 cached node2vec embeddings for relational scoring.
    pub cached_embeddings: Option<Arc<crate::predictive::embedding_scorer::CachedEmbeddings>>,

    /// Run all detectors including deep-scan detectors (default: false).
    /// Selects `create_all_detectors` instead of `create_default_detectors`.
    pub all_detectors: bool,

    // Incremental optimization hints (engine provides these)
    /// Files changed since the previous analyze run. The incremental path is
    /// taken only when this, `cached_file_findings` and
    /// `cached_graph_wide_findings` are all `Some`.
    pub changed_files: Option<&'a [PathBuf]>,
    /// Files removed since the previous analyze run. Cached findings for these
    /// files must be expunged so we don't carry forward phantom findings for
    /// paths that no longer exist.
    pub removed_files: Option<&'a [PathBuf]>,
    /// Whether the graph topology changed. When true, the cached precompute
    /// data and the cached graph-wide findings are not reused.
    pub topology_changed: bool,
    /// Precomputed analysis from a previous run; reused on the incremental
    /// path only when `topology_changed` is false.
    pub cached_gd_precomputed: Option<&'a PrecomputedAnalysis>,
    /// Previous per-file findings, keyed by the file paths detectors emit in
    /// `Finding::affected_files`.
    pub cached_file_findings: Option<&'a HashMap<PathBuf, Vec<Finding>>>,
    /// Previous graph-wide findings, keyed by detector name.
    pub cached_graph_wide_findings: Option<&'a HashMap<String, Vec<Finding>>>,
}

/// Statistics from the detect stage.
///
/// A plain value type (counters plus one timing). It derives the common
/// value traits so callers can log it with `{:?}`, copy a snapshot, compare
/// two runs, or start from `DetectStats::default()`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DetectStats {
    /// Number of detectors selected for this run.
    pub detectors_run: usize,
    /// Number of detectors reported as skipped for this run.
    pub detectors_skipped: usize,
    /// Findings from the "GI" side of the former GI/GD split; the unified
    /// run does not split findings and reports 0 here.
    pub gi_findings: usize,
    /// Findings from the "GD" side of the former GI/GD split; the unified
    /// cold run reports all of its findings here.
    pub gd_findings: usize,
    /// Wall-clock time spent in the precompute step.
    pub precompute_duration: Duration,
}

/// Output from the detect stage.
///
/// Findings are split into those that still need postprocessing
/// (`findings`) and those carried forward from cache (`cached_findings`);
/// the two maps index the same findings for later incremental runs.
pub struct DetectOutput {
    /// New findings that need postprocessing (dedup, suppression, GBDT filtering).
    pub findings: Vec<Finding>,
    /// Already-postprocessed findings carried forward from cache.
    /// These bypass the postprocessor and are merged after postprocessing.
    pub cached_findings: Vec<Finding>,
    /// Precomputed analysis used to build this run's analysis contexts.
    /// Presumably cached by the engine and passed back as
    /// `cached_gd_precomputed` on the next run — confirm against the caller.
    pub precomputed: PrecomputedAnalysis,
    /// Findings from non-graph-wide detectors, indexed by affected file.
    /// A finding that lists several affected files appears under each of them.
    pub findings_by_file: HashMap<PathBuf, Vec<Finding>>,
    /// Keyed by detector name for selective invalidation on incremental runs.
    pub graph_wide_findings: HashMap<String, Vec<Finding>>,
    /// Detector names that opt out of GBDT postprocessor filtering.
    pub bypass_set: HashSet<String>,
    /// Counters and timing for this run.
    pub stats: DetectStats,
}

/// Build detectors, precompute shared data, run all detectors in parallel.
///
/// Detectors are created through the registry (all detectors when
/// `input.all_detectors` is set, the default set otherwise) and those named
/// in `input.skip_detectors` are removed.
///
/// When the engine supplies all three incremental hints (`changed_files`,
/// `cached_file_findings`, `cached_graph_wide_findings`) the work is handed
/// to `detect_stage_incremental`. Otherwise this is the cold path: shared
/// data is precomputed, every selected detector runs against one full
/// context, and the filtered, sorted findings are partitioned into per-file
/// and graph-wide maps.
///
/// `stats.detectors_skipped` is the number of created detectors that were
/// actually filtered out by the skip list, on both paths.
///
/// # Errors
///
/// Propagates any error returned by the incremental path.
pub fn detect_stage(input: &DetectInput) -> Result<DetectOutput> {
    let skip_set: HashSet<&str> = input.skip_detectors.iter().map(|s| s.as_str()).collect();

    // Build DetectorInit and create all detectors via the registry
    let resolver = build_threshold_resolver(input.style_profile);

    let init = DetectorInit {
        repo_path: input.repo_path,
        project_config: input.project_config,
        resolver: resolver.clone(),
        ngram_model: input.ngram_model,
    };

    let candidates = if input.all_detectors {
        create_all_detectors(&init)
    } else {
        create_default_detectors(&init)
    };

    // Count the detectors that are really dropped rather than the number of
    // names in the skip list: a skip name that matches no created detector
    // (a typo, or a deep-scan detector absent from the default set) must not
    // inflate the statistic.
    let mut detectors_skipped = 0usize;
    let detectors: Vec<Arc<dyn crate::detectors::Detector>> = candidates
        .into_iter()
        .filter(|d| {
            let skipped = skip_set.contains(d.name());
            if skipped {
                detectors_skipped += 1;
            }
            !skipped
        })
        .collect();

    let detectors_run = detectors.len();

    let graph = input.graph;

    // Incremental fast path: if all three hints are present, dispatch to
    // the incremental function which only re-runs detectors on changed files.
    if let (Some(changed_files), Some(cached_file_findings), Some(cached_graph_wide_findings)) = (
        input.changed_files,
        input.cached_file_findings,
        input.cached_graph_wide_findings,
    ) {
        let mut output = detect_stage_incremental(
            input,
            &detectors,
            changed_files,
            input.removed_files.unwrap_or(&[]),
            cached_file_findings,
            cached_graph_wide_findings,
            &resolver,
        )?;
        // The incremental function never sees the skip list, so it cannot
        // report this number itself; fill it in so both paths agree.
        output.stats.detectors_skipped = detectors_skipped;
        return Ok(output);
    }

    // Precompute GD data (contexts, HMM, taint, etc.)
    let precompute_start = Instant::now();
    let hmm_cache_path = input.repo_path.join(".repotoire");
    let vs_clone = input.value_store.cloned();

    let mut precomputed = precompute_gd_startup(
        graph,
        input.repo_path,
        Some(&hmm_cache_path),
        input.source_files,
        vs_clone,
        &detectors,
    );
    let precompute_duration = precompute_start.elapsed();

    // Inject pre-computed taint results into security detectors
    inject_taint_precomputed(&detectors, &precomputed.taint_results);

    // Inject git churn data, co-change matrix, ownership, and embeddings into precomputed analysis
    precomputed.git_churn = Arc::clone(&input.file_churn);
    precomputed.co_change_matrix = input.co_change_matrix.as_ref().map(Arc::clone);
    precomputed.ownership = input.ownership.as_ref().map(Arc::clone);
    precomputed.cached_embeddings = input.cached_embeddings.as_ref().map(Arc::clone);

    // Build analysis context from precomputed data; plumb the
    // dual-branch config through (Phase 1d wired the config; Phase 2a
    // is the first detector to consume it).
    let ctx = precomputed
        .to_context(graph, &resolver)
        .with_dual_branch(input.project_config.dual_branch.clone());

    // Run all detectors in parallel
    let (mut findings, bypass_set) = run_detectors(&detectors, &ctx, input.workers);

    // Recorded before the post-detection filters, i.e. the raw detector output.
    let total_findings = findings.len();

    // Post-detection filters
    findings = apply_hmm_context_filter(findings, &ctx);
    filter_test_file_findings(&mut findings);
    sort_findings_deterministic(&mut findings);

    // Build a scope lookup so we route by detector_scope(), not affected_files.
    // Graph-wide detectors (e.g. MutualRecursionDetector, SinglePointOfFailureDetector)
    // may set affected_files but should still be keyed by detector name for selective
    // invalidation on incremental runs.
    let scope_map: HashMap<String, DetectorScope> = detectors
        .iter()
        .map(|d| (d.name().to_string(), d.detector_scope()))
        .collect();

    // Partition findings into per-file and graph-wide
    let mut findings_by_file: HashMap<PathBuf, Vec<Finding>> = HashMap::new();
    let mut graph_wide_findings: HashMap<String, Vec<Finding>> = HashMap::new();

    for finding in &findings {
        // A finding whose detector is not in the lookup is treated as file-scoped.
        let scope = scope_map
            .get(&finding.detector)
            .copied()
            .unwrap_or(DetectorScope::FileScopedGraph);
        if scope == DetectorScope::GraphWide {
            // Graph-wide finding — key by detector name
            graph_wide_findings
                .entry(finding.detector.clone())
                .or_default()
                .push(finding.clone());
        } else {
            // File-specific finding — indexed under every file it touches
            for file in &finding.affected_files {
                findings_by_file
                    .entry(file.clone())
                    .or_default()
                    .push(finding.clone());
            }
        }
    }

    Ok(DetectOutput {
        findings,
        cached_findings: Vec::new(), // cold path — everything goes through postprocessor
        precomputed,
        findings_by_file,
        graph_wide_findings,
        bypass_set,
        stats: DetectStats {
            detectors_run,
            detectors_skipped,
            gi_findings: 0,              // unified run — no GI/GD split
            gd_findings: total_findings, // all findings from unified run
            precompute_duration,
        },
    })
}

// ── Incremental fast path ────────────────────────────────────────────────────

/// Run detectors incrementally: per-file detectors only on changed files,
/// graph-wide detectors reuse cached findings when topology is unchanged.
///
/// Steps, in order:
/// 1. Carry forward cached per-file findings for files that are neither
///    changed nor removed.
/// 2. Run non-network-bound `FileLocal` detectors on the changed files.
/// 3. Run `FileScopedGraph` detectors: on the changed files when topology is
///    unchanged, or on everything (after dropping their carried-forward
///    findings) when topology changed.
/// 4. Re-run `GraphWide` detectors when topology changed, else reuse the
///    cached graph-wide findings.
///
/// Findings produced in this call go to `DetectOutput::findings`;
/// carried-forward ones go to `DetectOutput::cached_findings`.
///
/// * `changed_files` / `removed_files` — paths as provided by the engine;
///   they are made repo-relative before being matched against the cache keys.
/// * `cached_file_findings` — previous per-file findings keyed by the paths
///   detectors emit in `Finding::affected_files`.
/// * `cached_graph_wide_findings` — previous graph-wide findings keyed by
///   detector name.
///
/// The returned stats always report `detectors_skipped`, `gi_findings` and
/// `gd_findings` as 0; this function does not see the skip list.
fn detect_stage_incremental(
    input: &DetectInput,
    detectors: &[Arc<dyn crate::detectors::Detector>],
    changed_files: &[PathBuf],
    removed_files: &[PathBuf],
    cached_file_findings: &HashMap<PathBuf, Vec<Finding>>,
    cached_graph_wide_findings: &HashMap<String, Vec<Finding>>,
    resolver: &crate::calibrate::ThresholdResolver,
) -> Result<DetectOutput> {
    let graph = input.graph;
    // `changed_files` / `removed_files` arrive as absolute paths from the
    // engine's diff stage, but `cached_file_findings` is keyed by the
    // relative paths that detectors emit in `Finding::affected_files`.
    // Normalize both inputs to relative paths so the carry-forward filter
    // actually matches.  Without this, both removed and modified files
    // leak their cached findings into the next run as phantoms.
    let normalize = |p: &PathBuf| -> PathBuf {
        p.strip_prefix(input.repo_path)
            .map(|rel| rel.to_path_buf())
            .unwrap_or_else(|_| p.clone())
    };
    let changed_rel: Vec<PathBuf> = changed_files.iter().map(normalize).collect();
    let removed_rel: Vec<PathBuf> = removed_files.iter().map(normalize).collect();
    let changed_set: HashSet<&PathBuf> = changed_rel.iter().collect();
    let removed_set: HashSet<&PathBuf> = removed_rel.iter().collect();

    // Partition detectors by scope
    let mut file_local = Vec::new();
    let mut file_scoped_graph = Vec::new();
    let mut graph_wide_detectors = Vec::new();

    for d in detectors {
        match d.detector_scope() {
            DetectorScope::FileLocal => file_local.push(Arc::clone(d)),
            DetectorScope::FileScopedGraph => file_scoped_graph.push(Arc::clone(d)),
            DetectorScope::GraphWide => graph_wide_detectors.push(Arc::clone(d)),
        }
    }

    // Precompute: reuse cached when topology unchanged, else full recompute
    let precompute_start = Instant::now();
    let precomputed = if let Some(cached) = input
        .cached_gd_precomputed
        .filter(|_| !input.topology_changed)
    {
        // Fast path: reuse cached PrecomputedAnalysis
        // Re-run TAINT because changed files may have new sinks/sources
        let mut reused = cached.clone(); // cheap: all Arc bumps
        reused.git_churn = Arc::clone(&input.file_churn);
        reused.co_change_matrix = input.co_change_matrix.as_ref().map(Arc::clone);
        reused.ownership = input.ownership.as_ref().map(Arc::clone);
        reused.cached_embeddings = input.cached_embeddings.as_ref().map(Arc::clone);

        let needs_taint = detectors.iter().any(|d| d.taint_category().is_some());
        if needs_taint {
            let taint = crate::detectors::taint::centralized::run_centralized_taint(
                graph,
                input.repo_path,
                None,
            );
            reused.taint_results = Arc::new(taint);
        }

        // Rebuild file index: keep cached entries for files that are neither
        // changed nor removed, then add fresh content for changed files.
        // Removed files are excluded so a deleted file's content does not
        // linger in the index (and in the returned `precomputed`).
        // The index is matched against the paths exactly as the engine
        // provided them, not the repo-relative forms used for cache lookup.
        let changed_set_fi: HashSet<&PathBuf> = changed_files.iter().collect();
        let removed_set_fi: HashSet<&PathBuf> = removed_files.iter().collect();
        let mut file_data: Vec<_> = reused
            .file_index
            .all()
            .iter()
            .filter(|entry| {
                !changed_set_fi.contains(&entry.path) && !removed_set_fi.contains(&entry.path)
            })
            .map(|entry| (entry.path.clone(), Arc::clone(&entry.content), entry.flags))
            .collect();
        for p in changed_files {
            // Changed files with no content in the global cache are left out
            // of the index entirely.
            if let Some(content_string) = crate::cache::global_cache().content(p) {
                let content: Arc<str> = Arc::from(content_string.as_str());
                let flags = crate::detectors::detector_context::compute_content_flags(&content);
                file_data.push((p.clone(), content, flags));
            }
        }
        reused.file_index = Arc::new(crate::detectors::file_index::FileIndex::new(file_data));

        inject_taint_precomputed(detectors, &reused.taint_results);
        reused
    } else {
        // Slow path: full precompute
        let hmm_cache_path = input.repo_path.join(".repotoire");
        let vs_clone = input.value_store.cloned();
        let mut precomputed = precompute_gd_startup(
            graph,
            input.repo_path,
            Some(&hmm_cache_path),
            input.source_files,
            vs_clone,
            detectors,
        );
        inject_taint_precomputed(detectors, &precomputed.taint_results);
        precomputed.git_churn = Arc::clone(&input.file_churn);
        precomputed.co_change_matrix = input.co_change_matrix.as_ref().map(Arc::clone);
        precomputed.ownership = input.ownership.as_ref().map(Arc::clone);
        precomputed.cached_embeddings = input.cached_embeddings.as_ref().map(Arc::clone);
        precomputed
    };
    let precompute_duration = precompute_start.elapsed();

    // Build contexts: scoped (changed files only) and full (all files);
    // both carry the dual-branch config from the project configuration.
    let scoped_ctx = precomputed
        .to_context_scoped(graph, resolver, changed_files)
        .with_dual_branch(input.project_config.dual_branch.clone());
    let full_ctx = precomputed
        .to_context(graph, resolver)
        .with_dual_branch(input.project_config.dual_branch.clone());

    let mut new_findings: Vec<Finding> = Vec::new();
    let mut cached_findings_out: Vec<Finding> = Vec::new();
    let mut findings_by_file: HashMap<PathBuf, Vec<Finding>> = HashMap::new();
    let mut graph_wide_findings_out: HashMap<String, Vec<Finding>> = HashMap::new();

    // Pre-build bypass_set from ALL detectors (not just those that run)
    let bypass_set: HashSet<String> = detectors
        .iter()
        .filter(|d| d.bypass_postprocessor())
        .map(|d| d.name().to_string())
        .collect();

    // 1. Carry forward cached findings for UNCHANGED files → cached (already postprocessed).
    //    Skip files that are now removed — their findings are stale and would
    //    surface as phantom findings against non-existent paths.
    //    NOTE(review): a cached finding stored under several files is cloned
    //    into `cached_findings_out` once per unchanged file — confirm the
    //    caller de-duplicates.
    for (file, findings) in cached_file_findings {
        if !changed_set.contains(file) && !removed_set.contains(file) {
            findings_by_file.insert(file.clone(), findings.clone());
            cached_findings_out.extend(findings.iter().cloned());
        }
    }

    // 2. Run FileLocal detectors on CHANGED files only → new (needs postprocessing).
    //    Network-bound detectors (e.g., DepAuditDetector) are skipped; their
    //    cached findings for unchanged files were carried forward in step 1.
    //    NOTE(review): their cached findings on CHANGED files are dropped by
    //    step 1 and not regenerated here — confirm this is intended.
    let file_local_fast: Vec<_> = file_local
        .iter()
        .filter(|d| !d.is_network_bound())
        .cloned()
        .collect();
    if !file_local_fast.is_empty() {
        let (mut fl_findings, _) = run_detectors(&file_local_fast, &scoped_ctx, input.workers);
        fl_findings = apply_hmm_context_filter(fl_findings, &scoped_ctx);
        filter_test_file_findings(&mut fl_findings);
        for f in &fl_findings {
            for file in &f.affected_files {
                findings_by_file
                    .entry(file.clone())
                    .or_default()
                    .push(f.clone());
            }
        }
        new_findings.extend(fl_findings);
    }

    // 3. FileScopedGraph detectors:
    //    - Topology unchanged: re-run on changed files only (scoped ctx). The
    //      changed files were excluded from the cached carry-forward in step 1,
    //      so without this they would be missed entirely.
    //    - Topology changed: cross-file edges may have shifted findings on
    //      unchanged files too. Drop carry-forward findings emitted by FSG
    //      detectors and re-run against the full ctx.
    if !file_scoped_graph.is_empty() {
        let fsg_names: HashSet<String> = file_scoped_graph
            .iter()
            .map(|d| d.name().to_string())
            .collect();

        if input.topology_changed {
            // Drop FSG-emitted findings from carry-forward state.
            for findings in findings_by_file.values_mut() {
                findings.retain(|f| !fsg_names.contains(&f.detector));
            }
            findings_by_file.retain(|_, v| !v.is_empty());
            cached_findings_out.retain(|f| !fsg_names.contains(&f.detector));
        }

        let should_run = input.topology_changed || !changed_files.is_empty();
        if should_run {
            let ctx_to_use: &crate::detectors::analysis_context::AnalysisContext =
                if input.topology_changed {
                    &full_ctx
                } else {
                    &scoped_ctx
                };
            let (mut fsg_findings, _) =
                run_detectors(&file_scoped_graph, ctx_to_use, input.workers);
            fsg_findings = apply_hmm_context_filter(fsg_findings, ctx_to_use);
            filter_test_file_findings(&mut fsg_findings);
            for f in &fsg_findings {
                for file in &f.affected_files {
                    findings_by_file
                        .entry(file.clone())
                        .or_default()
                        .push(f.clone());
                }
            }
            new_findings.extend(fsg_findings);
        }
    }

    // 4. GraphWide detectors: re-run if topology changed → new, else reuse cache → cached
    if input.topology_changed {
        let (mut gw_findings, _) = run_detectors(&graph_wide_detectors, &full_ctx, input.workers);
        gw_findings = apply_hmm_context_filter(gw_findings, &full_ctx);
        filter_test_file_findings(&mut gw_findings);
        for f in &gw_findings {
            graph_wide_findings_out
                .entry(f.detector.clone())
                .or_default()
                .push(f.clone());
        }
        new_findings.extend(gw_findings);
    } else {
        for (detector, findings) in cached_graph_wide_findings {
            graph_wide_findings_out.insert(detector.clone(), findings.clone());
            cached_findings_out.extend(findings.iter().cloned());
        }
    }

    // Only the new findings are sorted here; carried-forward ones keep the
    // order in which they were collected from the cache maps.
    sort_findings_deterministic(&mut new_findings);

    Ok(DetectOutput {
        findings: new_findings,
        cached_findings: cached_findings_out,
        precomputed,
        findings_by_file,
        graph_wide_findings: graph_wide_findings_out,
        bypass_set,
        stats: DetectStats {
            detectors_run: detectors.len(),
            detectors_skipped: 0,
            gi_findings: 0,
            gd_findings: 0,
            precompute_duration,
        },
    })
}