use crate::calibrate::NgramModel;
use crate::graph::GraphQueryExt;
use crate::detectors::base::Detector;
use crate::models::{Finding, Severity};
use anyhow::Result;
use std::path::{Path, PathBuf};
use tracing::info;
pub struct SurprisalDetector {
repository_path: PathBuf,
model: NgramModel,
max_findings: usize,
}
impl SurprisalDetector {
pub fn new(repository_path: impl Into<PathBuf>, model: NgramModel) -> Self {
Self {
repository_path: repository_path.into(),
model,
max_findings: 30,
}
}
fn analyze_file(
&self,
path: &Path,
content: &str,
graph: &dyn crate::graph::GraphQuery,
baseline_mean: f64,
baseline_std: f64,
) -> Vec<Finding> {
let i = graph.interner();
let mut findings = Vec::new();
let rel_path = path.strip_prefix(&self.repository_path).unwrap_or(path);
let rel_str = rel_path.to_string_lossy();
let lines: Vec<&str> = content.lines().collect();
let all_funcs = graph.get_functions_shared();
let functions: Vec<_> = all_funcs.iter()
.filter(|f| f.path(i) == rel_str.as_ref() || rel_str.ends_with(f.path(i)) || f.path(i).ends_with(&*rel_str))
.collect();
for func in &functions {
let start = func.line_start.saturating_sub(1) as usize;
let end = (func.line_end as usize).min(lines.len());
if start >= end || end - start < 8 {
continue; }
let func_lines = &lines[start..end];
let prev_before_func = if start > 0 { lines.get(start - 1).copied() } else { None };
let suppressed = func_lines.iter().enumerate().any(|(i, line)| {
let prev = if i > 0 { Some(func_lines[i - 1]) } else { prev_before_func };
super::is_line_suppressed_for(line, prev, "surprisal")
});
if suppressed {
continue;
}
let (avg_surprisal, max_surprisal, peak_line) = self.model.function_surprisal(func_lines);
if avg_surprisal <= 0.0 {
continue;
}
let z_score = if baseline_std > 0.0 {
(avg_surprisal - baseline_mean) / baseline_std
} else {
0.0
};
if z_score < 2.0 {
continue;
}
let severity = if z_score > 3.5 {
Severity::High
} else if z_score > 2.5 {
Severity::Medium
} else {
Severity::Low
};
let _peak_line_num = start + peak_line + 1;
let peak_content = func_lines.get(peak_line)
.map(|l| l.trim())
.unwrap_or("")
.chars().take(80).collect::<String>();
findings.push(Finding {
id: String::new(),
detector: "SurprisalDetector".to_string(),
severity,
title: format!(
"Unusual code pattern in `{}`",
func.node_name(i)
),
description: format!(
"Function `{}` has unusually high surprisal ({:.1} bits, project mean: {:.1}, z-score: {:.1}).\n\n\
This code doesn't match the typical patterns in this project. \
Most surprising line ({:.1} bits):\n```\n{}\n```\n\n\
**Possible causes:**\n\
- AI-generated code with different style\n\
- Copy-pasted from a different codebase\n\
- Unusual algorithm or pattern\n\
- Potential bug (buggy code tends to be more surprising)",
func.node_name(i), avg_surprisal, baseline_mean, z_score,
max_surprisal, peak_content
),
affected_files: vec![path.to_path_buf()],
line_start: Some(func.line_start),
line_end: Some(func.line_end),
suggested_fix: Some(
"Review this function for:\n\
1. Style consistency with the rest of the project\n\
2. Correctness — unusual patterns may indicate bugs\n\
3. If AI-generated, verify it does what you expect".to_string()
),
estimated_effort: Some("15 minutes".to_string()),
category: Some("ai-quality".to_string()),
cwe_id: None,
why_it_matters: Some(format!(
"Research shows that buggy code lines have significantly higher entropy \
than correct code (Ray & Hellendoorn, 2015). This function's token patterns \
are in the top {:.1}% most unusual in this project.",
(1.0 - normal_cdf(z_score)) * 100.0
)),
threshold_metadata: [
("threshold_source".to_string(), "predictive".to_string()),
("surprisal_bits".to_string(), format!("{:.2}", avg_surprisal)),
("baseline_mean".to_string(), format!("{:.2}", baseline_mean)),
("baseline_std".to_string(), format!("{:.2}", baseline_std)),
("z_score".to_string(), format!("{:.2}", z_score)),
].into_iter().collect(),
..Default::default()
});
}
findings
}
}
impl Detector for SurprisalDetector {
fn name(&self) -> &'static str {
"surprisal"
}
fn description(&self) -> &'static str {
"Detects statistically unusual code patterns using predictive coding"
}
fn detect(&self, ctx: &crate::detectors::analysis_context::AnalysisContext) -> Result<Vec<Finding>> {
let graph = ctx.graph;
let files = &ctx.as_file_provider();
let i = graph.interner();
if !self.model.is_confident() {
info!(
"SurprisalDetector: skipping analysis — n-gram model is not confident \
(insufficient training data). Run calibration on a larger codebase to enable."
);
return Ok(vec![]);
}
let mut all_surprisals = Vec::new();
let mut file_data: Vec<(PathBuf, String)> = Vec::new();
for path in files.files_with_extensions(&["rs", "py", "ts", "tsx", "js", "jsx", "go", "java",
"c", "cpp", "cc", "h", "hpp", "cs", "kt"]) {
if crate::detectors::content_classifier::is_non_production_path(
&path.to_string_lossy()
) {
continue;
}
if let Some(content) = files.content(path) {
let lines: Vec<&str> = content.lines().collect();
let rel_path = path.strip_prefix(&self.repository_path).unwrap_or(path);
let rel_str = rel_path.to_string_lossy();
let all_funcs = graph.get_functions_shared();
let functions: Vec<_> = all_funcs.iter()
.filter(|f| f.path(i) == rel_str.as_ref() || rel_str.ends_with(f.path(i)) || f.path(i).ends_with(&*rel_str))
.collect();
for func in &functions {
let start = func.line_start.saturating_sub(1) as usize;
let end = (func.line_end as usize).min(lines.len());
if start >= end || end - start < 8 { continue; }
let func_lines = &lines[start..end];
let (avg, _, _) = self.model.function_surprisal(func_lines);
if avg > 0.0 {
all_surprisals.push(avg);
}
}
file_data.push((path.to_path_buf(), content.to_string()));
}
}
if all_surprisals.len() < 20 {
info!("SurprisalDetector: not enough functions to establish baseline ({})", all_surprisals.len());
return Ok(vec![]);
}
let n = all_surprisals.len() as f64;
let mean = all_surprisals.iter().sum::<f64>() / n;
let variance = all_surprisals.iter().map(|s| (s - mean).powi(2)).sum::<f64>() / n;
let std = variance.sqrt();
info!(
"SurprisalDetector baseline: mean={:.2} bits, std={:.2}, n={} functions",
mean, std, all_surprisals.len()
);
let mut findings = Vec::new();
for (path, content) in &file_data {
if findings.len() >= self.max_findings { break; }
let mut file_findings = self.analyze_file(path, content, graph, mean, std);
findings.append(&mut file_findings);
}
findings.sort_by(|a, b| {
let za = a.threshold_metadata.get("z_score")
.and_then(|s| s.parse::<f64>().ok()).unwrap_or(0.0);
let zb = b.threshold_metadata.get("z_score")
.and_then(|s| s.parse::<f64>().ok()).unwrap_or(0.0);
zb.partial_cmp(&za).unwrap_or(std::cmp::Ordering::Equal)
});
findings.truncate(self.max_findings);
info!("SurprisalDetector found {} unusual functions", findings.len());
Ok(findings)
}
}
fn normal_cdf(z: f64) -> f64 {
0.5 * (1.0 + erf(z / std::f64::consts::SQRT_2))
}
fn erf(x: f64) -> f64 {
let a1 = 0.254829592;
let a2 = -0.284496736;
let a3 = 1.421413741;
let a4 = -1.453152027;
let a5 = 1.061405429;
let coeff_p = 0.3275911;
let sign = if x < 0.0 { -1.0 } else { 1.0 };
let x = x.abs();
let t = 1.0 / (1.0 + coeff_p * x);
let y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * (-x * x).exp();
sign * y
}
impl super::RegisteredDetector for SurprisalDetector {
fn create(init: &super::DetectorInit) -> std::sync::Arc<dyn Detector> {
let model = init.ngram_model.cloned().unwrap_or_default();
std::sync::Arc::new(Self::new(init.repo_path, model))
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::calibrate::NgramModel;
use crate::graph::builder::GraphBuilder;
#[test]
fn test_normal_cdf_known_values() {
let cdf_zero = normal_cdf(0.0);
assert!(
(cdf_zero - 0.5).abs() < 1e-6,
"normal_cdf(0) should be 0.5, got {}",
cdf_zero
);
let cdf_large = normal_cdf(4.0);
assert!(
cdf_large > 0.99,
"normal_cdf(4.0) should be > 0.99, got {}",
cdf_large
);
let cdf_neg = normal_cdf(-4.0);
assert!(
cdf_neg < 0.01,
"normal_cdf(-4.0) should be < 0.01, got {}",
cdf_neg
);
}
#[test]
fn test_non_confident_model_returns_empty() {
let dir = tempfile::tempdir().expect("should create temp dir");
let file = dir.path().join("module.py");
std::fs::write(
&file,
r#"
def foo():
return 42
"#,
)
.expect("should write test file");
let model = NgramModel::new(); assert!(!model.is_confident());
let store = GraphBuilder::new().freeze();
let detector = SurprisalDetector::new(dir.path(), model);
let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(&store, vec![]);
let findings = detector.detect(&ctx).expect("detection should succeed");
assert!(
findings.is_empty(),
"Non-confident model should produce no findings, but got: {:?}",
findings.iter().map(|f| &f.title).collect::<Vec<_>>()
);
}
}