#![allow(dead_code)]
use crate::detectors::base::{Detector, DetectorConfig};
use crate::graph::GraphQueryExt;
use crate::models::{Finding, Severity};
use anyhow::Result;
use std::collections::HashMap;
use std::path::PathBuf;
use tracing::{debug, info};
/// Default for the detector's `window_days` setting (config key `"window_days"`).
/// NOTE(review): presumably a look-back window in days; nothing visible in this file reads it — confirm.
const DEFAULT_WINDOW_DAYS: i64 = 30;
/// Default number of standard deviations above the mean complexity at which `detect`
/// starts treating a function as an outlier.
const DEFAULT_Z_SCORE_THRESHOLD: f64 = 3.0;
/// Default for `spike_before_max` (config key `"spike_before_max"`).
/// NOTE(review): presumably the highest "before" complexity for a change to count as a
/// spike; the value is only stored by the code visible here — confirm.
const DEFAULT_SPIKE_BEFORE_MAX: u32 = 5;
/// Default for `spike_after_min` (config key `"spike_after_min"`).
/// NOTE(review): presumably the lowest "after" complexity for a change to count as a
/// spike; the value is only stored by the code visible here — confirm.
const DEFAULT_SPIKE_AFTER_MIN: u32 = 15;
/// Default for `max_findings` (config key `"max_findings"`).
/// NOTE(review): `detect` as visible here does not cap its output with this value.
const DEFAULT_MAX_FINDINGS: usize = 50;
/// Complexity a function must strictly exceed before `detect` reports it
/// (functions classified as AST/compiler code use a higher floor inside `detect`).
const DEFAULT_MIN_COMPLEXITY: i64 = 20;
/// Lower-case function names considered too generic to describe what the code does.
///
/// [`is_generic_name`] matches a name against an entry when the name equals the entry
/// or continues it with an underscore (for example `handle` also covers `handle_request`).
const GENERIC_NAMES: &[&str] = &[
    "process",
    "handle",
    "execute",
    "run",
    "do",
    "go",
    "work",
    "func",
    "helper",
    "impl",
    "inner",
    "main_logic",
    "do_something",
    "do_work",
    "process_data",
];

/// Reports whether `name` looks like a placeholder rather than a descriptive name.
///
/// The comparison is case-insensitive (the name is lower-cased first). A name is
/// generic when it either
/// * equals an entry of [`GENERIC_NAMES`] or starts with `<entry>_`, or
/// * is `func` followed by one or more ASCII digits (`func1`, `func42`, ...).
fn is_generic_name(name: &str) -> bool {
    let lowered = name.to_lowercase();

    // `strip_prefix` leaves the remainder after the listed word: an empty remainder is an
    // exact match, a remainder starting with '_' is the "<word>_suffix" form.
    let matches_listed_word = GENERIC_NAMES.iter().any(|&word| {
        lowered
            .strip_prefix(word)
            .is_some_and(|rest| rest.is_empty() || rest.starts_with('_'))
    });
    if matches_listed_word {
        return true;
    }

    // Numbered placeholders: "func" + at least one digit, nothing else.
    lowered
        .strip_prefix("func")
        .is_some_and(|digits| !digits.is_empty() && digits.chars().all(|c| c.is_ascii_digit()))
}
/// Summary statistics of function complexity across a codebase.
///
/// Produced by `AIComplexitySpikeDetector::compute_baseline` and used to judge how far
/// an individual function deviates from the rest of the code.
#[derive(Debug, Clone)]
pub struct CodebaseBaseline {
    /// Number of complexity samples the statistics were computed from.
    pub total_functions: usize,
    /// Median complexity.
    pub median_complexity: f64,
    /// Arithmetic mean complexity.
    pub mean_complexity: f64,
    /// Standard deviation of the complexity samples.
    pub stddev_complexity: f64,
    /// Smallest complexity observed.
    pub min_complexity: u32,
    /// Largest complexity observed.
    pub max_complexity: u32,
    /// 75th-percentile complexity.
    pub p75_complexity: f64,
    /// 90th-percentile complexity.
    pub p90_complexity: f64,
}

impl CodebaseBaseline {
    /// Distance of `complexity` from the baseline, in standard deviations.
    ///
    /// The distance is measured from the *median* (not the mean). A baseline whose
    /// standard deviation is exactly zero yields `0.0` instead of dividing by zero.
    pub fn z_score(&self, complexity: u32) -> f64 {
        let spread = self.stddev_complexity;
        if spread == 0.0 {
            0.0
        } else {
            (f64::from(complexity) - self.median_complexity) / spread
        }
    }

    /// Returns `true` when the z-score of `complexity` is strictly greater than `threshold`.
    pub fn is_outlier(&self, complexity: u32, threshold: f64) -> bool {
        let score = self.z_score(complexity);
        score > threshold
    }
}

impl Default for CodebaseBaseline {
    /// An empty baseline: every statistic is zero except the standard deviation,
    /// which is `1.0` so that z-scores computed against it stay well-defined.
    fn default() -> Self {
        Self {
            stddev_complexity: 1.0,
            total_functions: 0,
            min_complexity: 0,
            max_complexity: 0,
            median_complexity: 0.0,
            mean_complexity: 0.0,
            p75_complexity: 0.0,
            p90_complexity: 0.0,
        }
    }
}
/// Description of one function flagged for its complexity, together with the commit and
/// baseline figures that the finding builders on `AIComplexitySpikeDetector`
/// (`create_finding`, `build_description`, `build_suggested_fix`, `estimate_effort`)
/// turn into report text.
#[derive(Debug, Clone)]
pub struct ComplexitySpike {
/// Path of the file containing the function; becomes the finding's affected file.
pub file_path: String,
/// Function name shown in finding titles and descriptions.
pub function_name: String,
/// Qualified name of the function. NOTE(review): not read by the code visible in this file.
pub qualified_name: String,
/// Complexity reported as "Current complexity"; also drives the effort estimate.
pub current_complexity: u32,
/// Complexity reported as "Previous complexity"; a value of 0 makes `create_finding`
/// use the plain "Complexity Outlier" title instead of the "jumped from" title.
pub previous_complexity: u32,
/// Change in complexity shown in the "Delta" row; values of 15 or more raise the
/// severity to High. Presumably `current - previous` — confirm with the producer.
pub complexity_delta: i32,
/// Z-score shown in the report; values of 2.5 or more raise the severity to High.
pub z_score: f64,
/// Optional date printed as "When" in the description when present.
pub spike_date: Option<String>,
/// Commit identifier; only its leading characters are printed.
pub commit_sha: String,
/// Commit message printed verbatim in the description.
pub commit_message: String,
/// Commit author printed verbatim in the description.
pub author: String,
/// Line reported as the finding's `line_start`.
pub line_number: u32,
/// Baseline median printed in the report; with `baseline_stddev` it forms the
/// suggested target complexity.
pub baseline_median: f64,
/// Baseline standard deviation printed in the report.
pub baseline_stddev: f64,
}
/// Detector that reports functions whose complexity is a statistical outlier
/// relative to the rest of the codebase.
///
/// The tunables below are filled from the `DEFAULT_*` constants by `new` and from the
/// matching configuration keys by `with_config`.
pub struct AIComplexitySpikeDetector {
/// Detector configuration, exposed through `Detector::config`.
config: DetectorConfig,
/// Value of the `"window_days"` option. NOTE(review): not read by the code visible here.
window_days: i64,
/// Z-score cutoff; printed in the descriptions produced by `build_description`.
z_score_threshold: f64,
/// Value of the `"spike_before_max"` option. NOTE(review): not read by the code visible here.
spike_before_max: u32,
/// Value of the `"spike_after_min"` option. NOTE(review): not read by the code visible here.
spike_after_min: u32,
/// Value of the `"max_findings"` option. NOTE(review): not read by the code visible here.
max_findings: usize,
}
impl AIComplexitySpikeDetector {
/// Creates a detector with a fresh [`DetectorConfig`] and every tunable set to its
/// `DEFAULT_*` constant.
pub fn new() -> Self {
    let config = DetectorConfig::new();
    Self {
        max_findings: DEFAULT_MAX_FINDINGS,
        spike_after_min: DEFAULT_SPIKE_AFTER_MIN,
        spike_before_max: DEFAULT_SPIKE_BEFORE_MAX,
        z_score_threshold: DEFAULT_Z_SCORE_THRESHOLD,
        window_days: DEFAULT_WINDOW_DAYS,
        config,
    }
}
/// Creates a detector from `config`.
///
/// Each tunable is looked up under its option key (`"window_days"`,
/// `"z_score_threshold"`, `"spike_before_max"`, `"spike_after_min"`, `"max_findings"`)
/// via `get_option_or`, with the corresponding `DEFAULT_*` constant as the fallback.
/// The configuration itself is stored in the detector afterwards.
pub fn with_config(config: DetectorConfig) -> Self {
    // Read every option before `config` is moved into the struct.
    let window_days = config.get_option_or("window_days", DEFAULT_WINDOW_DAYS);
    let z_score_threshold = config.get_option_or("z_score_threshold", DEFAULT_Z_SCORE_THRESHOLD);
    let spike_before_max = config.get_option_or("spike_before_max", DEFAULT_SPIKE_BEFORE_MAX);
    let spike_after_min = config.get_option_or("spike_after_min", DEFAULT_SPIKE_AFTER_MIN);
    let max_findings = config.get_option_or("max_findings", DEFAULT_MAX_FINDINGS);

    Self {
        config,
        window_days,
        z_score_threshold,
        spike_before_max,
        spike_after_min,
        max_findings,
    }
}
/// Computes summary statistics for a set of function complexities.
///
/// * An empty slice yields `CodebaseBaseline::default()`.
/// * The standard deviation is the population form (divides by `n`) and is clamped to
///   a minimum of `1.0`.
/// * Percentiles pick the element at index `floor(n * fraction)` of the sorted samples,
///   falling back to the maximum when that index is out of range.
fn compute_baseline(&self, complexities: &[u32]) -> CodebaseBaseline {
    let count = complexities.len();
    if count == 0 {
        return CodebaseBaseline::default();
    }

    // Equal u32 values are indistinguishable, so an unstable sort gives the same order.
    let mut ordered: Vec<u32> = complexities.to_vec();
    ordered.sort_unstable();
    let lowest = ordered[0];
    let highest = ordered[count - 1];

    // Accumulate in u64 so the sum of many u32 values cannot overflow.
    let total: u64 = ordered.iter().copied().map(u64::from).sum();
    let mean = total as f64 / count as f64;

    let mid = count / 2;
    let median = if count.is_multiple_of(2) {
        (f64::from(ordered[mid - 1]) + f64::from(ordered[mid])) / 2.0
    } else {
        f64::from(ordered[mid])
    };

    let squared_deviations: f64 = ordered
        .iter()
        .map(|&value| {
            let deviation = f64::from(value) - mean;
            deviation * deviation
        })
        .sum::<f64>();
    let stddev = (squared_deviations / count as f64).sqrt().max(1.0);

    let percentile = |fraction: f64| -> f64 {
        let index = (count as f64 * fraction) as usize;
        f64::from(ordered.get(index).copied().unwrap_or(highest))
    };

    CodebaseBaseline {
        total_functions: count,
        median_complexity: median,
        mean_complexity: mean,
        stddev_complexity: stddev,
        min_complexity: lowest,
        max_complexity: highest,
        p75_complexity: percentile(0.75),
        p90_complexity: percentile(0.90),
    }
}
/// Converts a [`ComplexitySpike`] into a reportable [`Finding`].
///
/// * Severity is High when the z-score is at least 2.5 or the complexity grew by at
///   least 15; otherwise Medium.
/// * When a previous complexity is known (non-zero) the title names the jump and the
///   abbreviated commit; otherwise a plain "Complexity Outlier" title is used.
/// * Description, suggested fix and effort come from `build_description`,
///   `build_suggested_fix` and `estimate_effort`.
fn create_finding(&self, spike: &ComplexitySpike, baseline: &CodebaseBaseline) -> Finding {
    let severity = if spike.z_score >= 2.5 || spike.complexity_delta >= 15 {
        Severity::High
    } else {
        Severity::Medium
    };

    // First 7 bytes of the SHA. `str::get` returns `None` when the string is shorter
    // than that or when byte 7 is not a character boundary, so an unexpected
    // identifier falls back to the full string instead of panicking.
    let short_sha = spike
        .commit_sha
        .get(..7)
        .unwrap_or(spike.commit_sha.as_str());

    let title = if spike.previous_complexity > 0 {
        format!(
            "Function {} jumped from complexity {} to {} in commit {}",
            spike.function_name, spike.previous_complexity, spike.current_complexity, short_sha
        )
    } else {
        format!("Complexity Outlier: {}", spike.function_name)
    };

    let description = self.build_description(spike, baseline);
    let suggested_fix = self.build_suggested_fix(spike);

    Finding {
        id: String::new(),
        detector: "AIComplexitySpikeDetector".to_string(),
        severity,
        title,
        description,
        affected_files: vec![PathBuf::from(&spike.file_path)],
        line_start: Some(spike.line_number),
        line_end: None,
        suggested_fix: Some(suggested_fix),
        estimated_effort: Some(self.estimate_effort(spike)),
        category: Some("complexity".to_string()),
        cwe_id: None,
        why_it_matters: Some(format!(
            "This function's complexity ({}) is {:.1} standard deviations above the \
             codebase median ({:.1}). Such complexity outliers often indicate \
             features added without proper decomposition.",
            spike.current_complexity, spike.z_score, spike.baseline_median
        )),
        ..Default::default()
    }
}
/// Renders the Markdown description of a complexity spike.
///
/// The text has three sections: a metrics table (previous/current complexity, delta,
/// baseline median and stddev, z-score against the configured threshold), the commit
/// details (date when known, abbreviated SHA, message, author, location) and a short
/// rationale. `_baseline` is accepted for signature compatibility and is not read; all
/// figures come from `spike`.
fn build_description(&self, spike: &ComplexitySpike, _baseline: &CodebaseBaseline) -> String {
    // First 8 bytes of the SHA. `str::get` returns `None` when the string is shorter
    // than that or when byte 8 is not a character boundary, so an unexpected
    // identifier falls back to the full string instead of panicking.
    let short_sha = spike
        .commit_sha
        .get(..8)
        .unwrap_or(spike.commit_sha.as_str());

    let mut desc = format!(
        "Function **{}** is a statistical complexity outlier.\n\n",
        spike.function_name
    );

    desc.push_str("### Complexity Analysis (Baseline Comparison)\n\n");
    desc.push_str("| Metric | Value |\n");
    desc.push_str("|--------|-------|\n");
    desc.push_str(&format!(
        "| Previous complexity | {} |\n",
        spike.previous_complexity
    ));
    desc.push_str(&format!(
        "| Current complexity | {} |\n",
        spike.current_complexity
    ));
    // `{:+}` always prints the sign, so a decrease renders as "-5" rather than "+-5".
    desc.push_str(&format!("| Delta | {:+} |\n", spike.complexity_delta));
    desc.push_str(&format!(
        "| Codebase median | {:.1} |\n",
        spike.baseline_median
    ));
    desc.push_str(&format!(
        "| Codebase stddev | {:.1} |\n",
        spike.baseline_stddev
    ));
    desc.push_str(&format!(
        "| **Z-score** | **{:.2}** (>{} = outlier) |\n\n",
        spike.z_score, self.z_score_threshold
    ));

    desc.push_str("### Commit Details\n\n");
    if let Some(ref date) = spike.spike_date {
        desc.push_str(&format!("- **When**: {}\n", date));
    }
    desc.push_str(&format!("- **Commit**: `{}`\n", short_sha));
    desc.push_str(&format!("- **Message**: {}\n", spike.commit_message));
    desc.push_str(&format!("- **Author**: {}\n", spike.author));
    desc.push_str(&format!(
        "- **Location**: `{}` line {}\n\n",
        spike.file_path, spike.line_number
    ));

    desc.push_str("### Why This Matters\n\n");
    desc.push_str(&format!(
        "This function's complexity is {:.1}σ above the codebase average. ",
        spike.z_score
    ));
    desc.push_str("Statistical outliers in complexity often indicate:\n");
    desc.push_str("- Features added without decomposing into smaller functions\n");
    desc.push_str("- Technical debt that will compound over time\n");
    desc.push_str("- Reduced testability and higher bug risk\n");
    desc
}
/// Renders the step-by-step remediation advice for a complexity spike.
///
/// The suggested target complexity is the baseline median plus one baseline standard
/// deviation, truncated to an integer.
fn build_suggested_fix(&self, spike: &ComplexitySpike) -> String {
    let target_complexity = (spike.baseline_median + spike.baseline_stddev) as u32;

    // First 8 bytes of the SHA. `str::get` returns `None` when the string is shorter
    // than that or when byte 8 is not a character boundary, so an unexpected
    // identifier falls back to the full string instead of panicking.
    let short_sha = spike
        .commit_sha
        .get(..8)
        .unwrap_or(spike.commit_sha.as_str());

    format!(
        "1. **Review commit `{}`** to understand what changed\n\n\
         2. **Decompose the function** using these patterns:\n\
         - Extract Method: Move logical blocks into separate functions\n\
         - Replace Conditional with Polymorphism (for branching logic)\n\
         - Introduce Parameter Object (for many parameters)\n\n\
         3. **Target complexity**: Reduce from {} to below {} (1σ above median)\n\n\
         4. **Add tests** before refactoring to catch regressions",
        short_sha, spike.current_complexity, target_complexity
    )
}
/// Maps the spike's current complexity to a human-readable effort bucket:
/// below 20 is small, below 30 medium, below 50 large, anything higher extra large.
fn estimate_effort(&self, spike: &ComplexitySpike) -> String {
    let label = match spike.current_complexity {
        0..=19 => "Small (1-2 hours)",
        20..=29 => "Medium (half day)",
        30..=49 => "Large (1 day)",
        _ => "Extra Large (2+ days)",
    };
    label.to_string()
}
/// Reports whether `func_name` starts with a short namespace-style prefix such as
/// `u3_` or `uv_`.
///
/// The prefix is the text before the first underscore. It counts when it is 2 to 4
/// bytes long, consists only of alphanumeric characters, and (case-insensitively) is
/// not one of a fixed set of ordinary English/programming words like `get` or `test`.
fn has_runtime_prefix(func_name: &str) -> bool {
    const COMMON_WORDS: &[&str] = &[
        "get", "set", "is", "do", "can", "has", "new", "old", "add", "del", "pop", "put", "run",
        "try", "end", "use", "for", "the", "and", "not", "dead", "live", "test", "mock", "fake",
        "stub", "temp", "tmp", "foo", "bar", "baz", "qux", "call", "read", "load", "save", "send",
        "recv",
    ];

    // No underscore at all means there is no prefix to inspect.
    let Some((prefix, _rest)) = func_name.split_once('_') else {
        return false;
    };
    // The prefix length in bytes equals the byte position of the first underscore.
    if !(2..=4).contains(&prefix.len()) {
        return false;
    }
    if !prefix.chars().all(char::is_alphanumeric) {
        return false;
    }

    let lowered = prefix.to_lowercase();
    !COMMON_WORDS.contains(&lowered.as_str())
}
}
impl Default for AIComplexitySpikeDetector {
fn default() -> Self {
Self::new()
}
}
impl Detector for AIComplexitySpikeDetector {
/// Identifier of this detector; the same string is written into each finding's
/// `detector` field.
fn name(&self) -> &'static str {
"AIComplexitySpikeDetector"
}
/// One-line human-readable summary of what the detector reports.
fn description(&self) -> &'static str {
"Detects complexity outliers using research-backed baseline comparison with compound signals"
}
/// Category label under which this detector is grouped.
fn category(&self) -> &'static str {
"ai_generated"
}
/// Always `true`: `detect` reads functions and call fan-out from the code graph.
fn requires_graph(&self) -> bool {
true
}
/// Returns the configuration this detector was built with.
fn config(&self) -> Option<&DetectorConfig> {
Some(&self.config)
}
/// File extensions (without the dot) this detector declares itself applicable to.
fn file_extensions(&self) -> &'static [&'static str] {
&[
"py", "js", "ts", "jsx", "tsx", "java", "go", "rs", "c", "cpp", "cs",
]
}
fn detect(
&self,
ctx: &crate::detectors::analysis_context::AnalysisContext,
) -> Result<Vec<Finding>> {
let graph = ctx.graph;
let i = graph.interner();
use std::collections::HashSet;
let mut findings = Vec::new();
let functions = graph.get_functions_shared();
let complexities: Vec<i64> = functions
.iter()
.filter_map(|f| f.complexity_opt())
.collect();
if complexities.is_empty() {
return Ok(vec![]);
}
let avg: f64 = complexities.iter().sum::<i64>() as f64 / complexities.len() as f64;
let variance: f64 = complexities
.iter()
.map(|&c| (c as f64 - avg).powi(2))
.sum::<f64>()
/ complexities.len() as f64;
let std_dev = variance.sqrt();
let threshold = avg + DEFAULT_Z_SCORE_THRESHOLD * std_dev;
let mut skip_files: HashSet<String> = HashSet::new();
let mut compiler_files: HashSet<String> = HashSet::new();
{
let mut seen_files: HashSet<String> = HashSet::new();
for func in functions.iter() {
if !seen_files.insert(func.path(i).to_string()) {
continue; }
let fp = func.path(i);
if fp.contains("/detectors/")
|| fp.contains("/parsers/")
|| fp.contains("/runtime/")
|| fp.contains("/vm/")
|| fp.contains("/interpreter/")
|| fp.contains("/bytecode/")
|| fp.contains("/jets/")
|| fp.contains("/opcodes/")
|| fp.contains("/noun/")
|| fp.contains("/ext/")
|| fp.contains("/vendor/")
|| fp.contains("/reconciler/")
|| fp.contains("/scheduler/")
|| fp.contains("/react-dom/")
|| fp.contains("/react-server/")
|| fp.contains("/shared/")
|| fp.contains("packages/react")
|| fp.contains("/forks/")
|| fp.contains("/fiber/")
|| crate::detectors::content_classifier::is_non_production_path(fp)
|| crate::detectors::content_classifier::is_likely_bundled_path(fp)
{
skip_files.insert(func.path(i).to_string());
continue;
}
if crate::detectors::content_classifier::is_compiler_code_path(fp) {
compiler_files.insert(func.path(i).to_string());
}
if let Some(content) =
crate::cache::global_cache().content(std::path::Path::new(fp))
{
if crate::detectors::content_classifier::is_bundled_code(&content)
|| crate::detectors::content_classifier::is_minified_code(&content)
|| crate::detectors::content_classifier::is_fixture_code(fp, &content)
{
skip_files.insert(func.path(i).to_string());
}
}
}
}
for func in functions.iter() {
if skip_files.contains(func.path(i)) {
continue;
}
let mut is_ast_code = compiler_files.contains(func.path(i));
if !is_ast_code {
if let Some(content) =
crate::cache::global_cache().content(std::path::Path::new(func.path(i)))
{
is_ast_code = crate::detectors::content_classifier::is_ast_manipulation_code(
func.node_name(i),
&content,
);
}
}
if Self::has_runtime_prefix(func.node_name(i)) {
continue;
}
if let Some(complexity) = func.complexity_opt() {
let effective_threshold = if is_ast_code {
threshold * 1.5
} else {
threshold
};
let min_complexity: i64 = if is_ast_code {
35
} else {
DEFAULT_MIN_COMPLEXITY
};
if complexity as f64 > effective_threshold && complexity > min_complexity {
let z_score = if std_dev > 0.0 {
(complexity as f64 - avg) / std_dev
} else {
0.0
};
let func_name = func.node_name(i);
let file_path = func.path(i);
let qualified_name = func.qn(i);
let mut compound_signals = 0;
if is_generic_name(func_name) {
compound_signals += 1;
}
if ctx.is_high_churn_file(file_path) {
compound_signals += 1;
}
let fan_out = graph.call_fan_out(qualified_name);
if fan_out >= 5 {
compound_signals += 1;
}
if compound_signals < 2 {
continue; }
let severity = match compound_signals {
3 => Severity::High,
_ => Severity::Medium, };
findings.push(Finding {
id: String::new(),
detector: "AIComplexitySpikeDetector".to_string(),
severity,
title: format!("Complexity Outlier: {}", func_name),
description: format!(
"Function '{}' is a statistical complexity outlier (z-score: {:.1}, complexity: {}).",
func_name, z_score, complexity
),
affected_files: vec![file_path.to_string().into()],
line_start: Some(func.line_start),
line_end: Some(func.line_end),
suggested_fix: Some("Review and refactor - consider breaking into smaller functions".to_string()),
estimated_effort: Some("Medium (1-2 hours)".to_string()),
category: Some("complexity".to_string()),
cwe_id: None,
why_it_matters: Some("Complexity outliers indicate functions that need review and decomposition".to_string()),
..Default::default()
});
}
}
}
Ok(findings)
}
}
/// Registration hook that lets the detector be constructed behind `Arc<dyn Detector>`.
impl crate::detectors::RegisteredDetector for AIComplexitySpikeDetector {
/// Builds the detector with default settings via `Self::new()`.
/// NOTE(review): `_init` is ignored, so nothing it carries reaches this detector —
/// confirm that is intended.
fn create(_init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
std::sync::Arc::new(Self::new())
}
}
// Unit tests for the baseline statistics, the name heuristics and the end-to-end
// `detect` behavior on small in-memory graphs.
#[cfg(test)]
mod tests {
use super::*;
use crate::graph::CodeNode;
// Median of the ten values 1..=10 is the mean of the two middle elements (5.5).
#[test]
fn test_compute_baseline() {
let detector = AIComplexitySpikeDetector::new();
let complexities = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
let baseline = detector.compute_baseline(&complexities);
assert_eq!(baseline.total_functions, 10);
assert!((baseline.median_complexity - 5.5).abs() < 0.01);
assert_eq!(baseline.min_complexity, 1);
assert_eq!(baseline.max_complexity, 10);
}
// (9 - median 5) / stddev 2 = 2.0; `is_outlier` compares strictly against the threshold.
#[test]
fn test_z_score() {
let baseline = CodebaseBaseline {
total_functions: 100,
median_complexity: 5.0,
mean_complexity: 5.0,
stddev_complexity: 2.0,
min_complexity: 1,
max_complexity: 20,
p75_complexity: 7.0,
p90_complexity: 10.0,
};
let z = baseline.z_score(9);
assert!((z - 2.0).abs() < 0.01);
assert!(baseline.is_outlier(9, 1.9));
assert!(!baseline.is_outlier(9, 2.1));
}
// An empty input falls back to the default baseline, whose stddev is 1.0.
#[test]
fn test_empty_baseline() {
let detector = AIComplexitySpikeDetector::new();
let baseline = detector.compute_baseline(&[]);
assert_eq!(baseline.total_functions, 0);
assert_eq!(baseline.stddev_complexity, 1.0); }
// Exact matches, `<word>_suffix` forms and `func<digits>` are generic; descriptive names are not.
#[test]
fn test_generic_name_detection() {
assert!(is_generic_name("process"));
assert!(is_generic_name("handle_request"));
assert!(is_generic_name("execute"));
assert!(is_generic_name("func1"));
assert!(is_generic_name("do_work"));
assert!(!is_generic_name("parse_http_header"));
assert!(!is_generic_name("validate_user_input"));
assert!(!is_generic_name("flush"));
}
// Builds a graph of low-complexity functions plus one complexity-50 function named
// `process_data` (generic name) that calls six helpers (fan-out >= 5), giving it the
// two compound signals `detect` requires.
#[test]
fn test_detects_complexity_outlier() {
use crate::graph::CodeEdge;
let mut store = crate::graph::GraphBuilder::new();
// Ten ordinary functions with complexity 3..=5.
for i in 0..10 {
let complexity = 3 + (i % 3); let node = CodeNode::function(&format!("normal_func_{}", i), "/src/app.py")
.with_qualified_name(&format!("app.normal_func_{}", i))
.with_lines(1, 20)
.with_property("complexity", complexity as i64);
store.add_node(node);
}
// Six trivial helpers that serve as call targets for the outlier.
for i in 0..6 {
let helper = CodeNode::function(&format!("helper_{}", i), "/src/app.py")
.with_qualified_name(&format!("app.helper_{}", i))
.with_lines(1, 10)
.with_property("complexity", 2_i64);
store.add_node(helper);
}
let outlier = CodeNode::function("process_data", "/src/app.py")
.with_qualified_name("app.process_data")
.with_lines(1, 200)
.with_property("complexity", 50_i64);
store.add_node(outlier);
// Call edges from the outlier to every helper.
for i in 0..6 {
store.add_edge_by_name(
"app.process_data",
&format!("app.helper_{}", i),
CodeEdge::calls(),
);
}
let detector = AIComplexitySpikeDetector::new();
let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
&store,
vec![],
);
let findings = detector
.detect(&ctx)
.expect("should detect complexity outlier");
assert!(
!findings.is_empty(),
"Should detect the complexity outlier (50 vs avg ~4) with 2+ compound signals"
);
assert!(
findings.iter().any(|f| f.title.contains("process_data")),
"Finding should reference the outlier function, got: {:?}",
findings.iter().map(|f| &f.title).collect::<Vec<_>>()
);
}
// Twenty functions with complexity 3..=7: none exceeds the minimum complexity, so
// nothing may be reported.
#[test]
fn test_no_finding_for_normal_complexity() {
let mut store = crate::graph::GraphBuilder::new();
for i in 0..20 {
let complexity = 3 + (i % 5); let node = CodeNode::function(&format!("func_{}", i), "/src/app.py")
.with_qualified_name(&format!("app.func_{}", i))
.with_lines(1, 30)
.with_property("complexity", complexity as i64);
store.add_node(node);
}
let detector = AIComplexitySpikeDetector::new();
let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
&store,
vec![],
);
let findings = detector
.detect(&ctx)
.expect("should detect normal complexity");
assert!(
findings.is_empty(),
"Should not flag any function when all complexities are normal (3-7), but got: {:?}",
findings.iter().map(|f| &f.title).collect::<Vec<_>>()
);
}
}