1use std::collections::HashMap;
8use std::path::Path;
9
10use compact_str::CompactString;
11
12use crate::deviation::{diff_sets, pick_best, Deviation, Differences, GoldenMatch};
13use crate::rule::{GoldenSelector, Rule};
14use crate::similarity::{aggregate, jaccard_sorted, SimilarityScore, Weights};
15use crate::tsed;
16use crate::FunctionRef;
17
/// Similarity threshold applied when neither the caller-supplied override nor
/// the rule itself provides one. A function whose best overall similarity is
/// strictly below the effective threshold is reported as a deviation.
const DEFAULT_THRESHOLD: f64 = 0.7;
19
/// Errors that abort deviation detection.
#[derive(Debug, thiserror::Error)]
pub enum PipelineError {
    /// A golden named by rule `rule_id` could not be resolved: either `file`
    /// is not a key of the extracted files, or that file's function list has
    /// no entry whose symbol equals `symbol`.
    #[error("rule `{rule_id}`: golden `{file}:{symbol}` not found in extracted files")]
    GoldenNotFound { rule_id: String, file: String, symbol: String },
}
25
/// Functions extracted per file, keyed by the file path string that rules and
/// golden selectors are matched against.
pub type ExtractedFiles = HashMap<String, Vec<FunctionRef>>;
28
29pub fn detect_deviations(
34 rules: &[Rule],
35 files: &ExtractedFiles,
36 threshold_override: Option<f64>,
37) -> Result<Vec<Deviation>, PipelineError> {
38 let weights = Weights::default();
39 let mut out = Vec::new();
40 for rule in rules {
41 if rule.disabled {
42 continue;
43 }
44 let goldens = resolve_goldens(rule, files)?;
45 let threshold = threshold_override
46 .or(rule.threshold)
47 .unwrap_or(DEFAULT_THRESHOLD);
48
49 for (file_path, funcs) in files {
50 if !rule.matches(Path::new(file_path)) {
51 continue;
52 }
53 for func in funcs {
54 if func.ignore.is_some() {
55 continue;
56 }
57 if is_golden(&goldens, file_path, &func.symbol) {
58 continue;
59 }
60 let matches = score_against_all(func, &goldens, weights);
61 let (best, sorted) = pick_best(matches);
62 if best.similarity.overall < threshold {
63 out.push(build_deviation(rule, file_path, func, best, sorted, &goldens));
64 }
65 }
66 }
67 }
68 Ok(out)
69}
70
71fn resolve_goldens<'f>(
72 rule: &Rule,
73 files: &'f ExtractedFiles,
74) -> Result<Vec<(GoldenSelector, &'f FunctionRef)>, PipelineError> {
75 let mut goldens = Vec::with_capacity(rule.goldens.len());
76 for g in &rule.goldens {
77 let funcs = files.get(&g.file).ok_or_else(|| PipelineError::GoldenNotFound {
78 rule_id: rule.id.clone(),
79 file: g.file.clone(),
80 symbol: g.symbol.clone(),
81 })?;
82 let func = funcs
83 .iter()
84 .find(|f| f.symbol.as_str() == g.symbol)
85 .ok_or_else(|| PipelineError::GoldenNotFound {
86 rule_id: rule.id.clone(),
87 file: g.file.clone(),
88 symbol: g.symbol.clone(),
89 })?;
90 goldens.push((g.clone(), func));
91 }
92 Ok(goldens)
93}
94
95fn is_golden(
96 goldens: &[(GoldenSelector, &FunctionRef)],
97 file_path: &str,
98 symbol: &CompactString,
99) -> bool {
100 goldens
101 .iter()
102 .any(|(g, _)| g.file == file_path && g.symbol == symbol.as_str())
103}
104
105fn score_against_all(
106 func: &FunctionRef,
107 goldens: &[(GoldenSelector, &FunctionRef)],
108 weights: Weights,
109) -> Vec<GoldenMatch> {
110 goldens
111 .iter()
112 .map(|(sel, golden_func)| GoldenMatch {
113 golden: sel.clone(),
114 similarity: score_pair(func, golden_func, weights),
115 })
116 .collect()
117}
118
119pub fn score_pair(actual: &FunctionRef, golden: &FunctionRef, weights: Weights) -> SimilarityScore {
123 let shape = tsed::tsed(&actual.tree, &golden.tree);
124 let calls = jaccard_sorted(&actual.calls, &golden.calls);
125 let imports = jaccard_sorted(&actual.imports, &golden.imports);
126 let signature = if actual.signature == golden.signature { 1.0 } else { 0.0 };
127 aggregate(shape, calls, imports, signature, weights)
128}
129
130fn build_deviation(
131 rule: &Rule,
132 file_path: &str,
133 func: &FunctionRef,
134 best: GoldenMatch,
135 sorted: Vec<GoldenMatch>,
136 goldens: &[(GoldenSelector, &FunctionRef)],
137) -> Deviation {
138 let golden_func = goldens
139 .iter()
140 .find(|(g, _)| *g == best.golden)
141 .map(|(_, f)| *f)
142 .expect("matched golden must be in resolved set");
143 let (missing_calls, extra_calls) = diff_sets(&golden_func.calls, &func.calls);
144 let (missing_imports, extra_imports) = diff_sets(&golden_func.imports, &func.imports);
145 Deviation {
146 rule_id: rule.id.clone(),
147 file: file_path.to_string(),
148 symbol: func.symbol.clone(),
149 matched_golden: best.golden.clone(),
150 all_golden_scores: sorted,
151 similarity: best.similarity,
152 differences: Differences {
153 missing_calls,
154 extra_calls,
155 missing_imports,
156 extra_imports,
157 },
158 }
159}