use crate::detectors::base::{Detector, DetectorConfig};
use crate::graph::GraphQueryExt;
use crate::models::{deterministic_finding_id, Finding, Severity};
use anyhow::Result;
use std::collections::HashSet;
use std::path::PathBuf;
use tracing::info;
/// Detector that reports source files whose line count exceeds a threshold.
pub struct LargeFilesDetector {
    /// Repository root supplied at construction time. It is only stored;
    /// nothing in this file reads it back, hence the `dead_code` allowance.
    #[allow(dead_code)]
    repository_path: PathBuf,
    /// Upper bound on the number of findings a single `detect` run returns.
    max_findings: usize,
    /// A file is reported when its line count is strictly greater than this.
    threshold: usize,
    /// Built-in threshold; handed to the resolver as the baseline when it
    /// explains a measured value.
    default_threshold: usize,
    /// Threshold resolver used to produce the explanation attached to each finding.
    resolver: crate::calibrate::ThresholdResolver,
}
impl LargeFilesDetector {
#[allow(dead_code)] pub fn new(repository_path: impl Into<PathBuf>) -> Self {
Self {
repository_path: repository_path.into(),
max_findings: 50,
threshold: 800,
default_threshold: 800,
resolver: Default::default(),
}
}
pub fn with_resolver(
repository_path: impl Into<PathBuf>,
resolver: &crate::calibrate::ThresholdResolver,
) -> Self {
use crate::calibrate::MetricKind;
let default_threshold = 800usize;
let threshold = resolver.warn_usize(MetricKind::FileLength, default_threshold);
if threshold != default_threshold {
tracing::info!(
"LargeFiles: adaptive threshold {} (default={})",
threshold,
default_threshold
);
}
Self {
repository_path: repository_path.into(),
max_findings: 50,
threshold,
default_threshold,
resolver: resolver.clone(),
}
}
fn analyze_file_structure(
graph: &dyn crate::graph::GraphQuery,
file_path: &str,
) -> FileAnalysis {
let i = graph.interner();
let functions = graph.get_functions_in_file(file_path);
let func_count = functions.len();
let mut importers: HashSet<String> = HashSet::new();
for func in &functions {
for caller in graph.get_callers(func.qn(i)) {
if caller.path(i) != file_path {
importers.insert(caller.path(i).to_string());
}
}
}
let largest_func = functions
.iter()
.map(|f| {
(
f.node_name(i).to_string(),
f.line_end.saturating_sub(f.line_start),
)
})
.max_by_key(|(_, size)| *size);
let mut prefixes: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
for func in &functions {
if let Some(prefix) = func.node_name(i).split('_').next() {
if prefix.len() > 2 && func.node_name(i).contains('_') {
prefixes.insert(prefix.to_string());
}
}
}
FileAnalysis {
func_count,
importer_count: importers.len(),
largest_func,
potential_modules: prefixes.into_iter().take(5).collect(),
}
}
}
/// Graph-derived facts about a single file, used to enrich a finding.
struct FileAnalysis {
    /// Number of functions the graph reports for the file.
    func_count: usize,
    /// Number of distinct other files that contain a caller of one of the
    /// file's functions.
    importer_count: usize,
    /// Name and size (`line_end - line_start`) of the biggest function, or
    /// `None` when the file has no known functions.
    largest_func: Option<(String, u32)>,
    /// Up to five function-name prefixes suggested as candidate module names.
    potential_modules: Vec<String>,
}
impl Detector for LargeFilesDetector {
fn name(&self) -> &'static str {
"large-files"
}
fn description(&self) -> &'static str {
"Detects files exceeding size threshold"
}
fn file_extensions(&self) -> &'static [&'static str] {
&[
"py", "js", "ts", "jsx", "tsx", "rb", "java", "go", "rs", "c", "cpp", "cs",
]
}
fn detect(
&self,
ctx: &crate::detectors::analysis_context::AnalysisContext,
) -> Result<Vec<Finding>> {
let graph = ctx.graph;
let files = &ctx.as_file_provider();
let mut findings = vec![];
for path in files.files_with_extensions(&[
"py", "js", "ts", "jsx", "tsx", "rs", "go", "java", "cs", "cpp", "c", "h", "rb", "php",
]) {
if findings.len() >= self.max_findings {
break;
}
let path_str = path.to_string_lossy().to_string();
if path_str.contains("vendor")
|| path_str.contains("node_modules")
|| path_str.contains("generated")
|| path_str.contains(".min.")
{
continue;
}
if let Some(content) = files.content(path) {
let lines = content.lines().count();
if lines > self.threshold {
let analysis = Self::analyze_file_structure(graph, &path_str);
let severity = if lines > 2000 || analysis.importer_count > 10 {
Severity::High
} else if lines > 1000 || analysis.importer_count > 5 {
Severity::Medium
} else {
Severity::Low
};
let mut notes = Vec::new();
notes.push(format!("📏 {} lines", lines));
if analysis.func_count > 0 {
notes.push(format!("📦 {} functions", analysis.func_count));
}
if analysis.importer_count > 0 {
notes.push(format!(
"🔗 {} files depend on this",
analysis.importer_count
));
}
if let Some((name, size)) = &analysis.largest_func {
notes.push(format!("📐 Largest function: `{}` ({} lines)", name, size));
}
let context_notes = format!("\n\n**Analysis:**\n{}", notes.join("\n"));
let suggestion = if !analysis.potential_modules.is_empty() {
format!(
"Consider splitting by function prefix into separate modules:\n\n\
{}\n\n\
```python\n\
# {}_utils.py - extract {}_* functions\n\
# {}_core.py - extract core logic\n\
```",
analysis
.potential_modules
.iter()
.map(|p| format!("• `{}_*` functions → `{}.py`", p, p))
.collect::<Vec<_>>()
.join("\n"),
analysis
.potential_modules
.first()
.unwrap_or(&"module".to_string()),
analysis
.potential_modules
.first()
.unwrap_or(&"module".to_string()),
path.file_stem().and_then(|s| s.to_str()).unwrap_or("file")
)
} else {
"Split into smaller, focused modules. Group related functions together."
.to_string()
};
let effort = if lines > 1000 {
"2-4 hours"
} else {
"1-2 hours"
};
let explanation = self.resolver.explain(
crate::calibrate::MetricKind::FileLength,
lines as f64,
self.default_threshold as f64,
);
let threshold_metadata = explanation.to_metadata().into_iter().collect();
findings.push(Finding {
id: String::new(),
detector: "LargeFilesDetector".to_string(),
severity,
title: format!("Large file: {} lines", lines),
description: format!(
"File exceeds recommended size ({} lines > {} threshold).{}\n\n📊 {}",
lines,
self.threshold,
context_notes,
explanation.to_note()
),
affected_files: vec![path.to_path_buf()],
line_start: Some(1),
line_end: Some(lines as u32),
suggested_fix: Some(suggestion),
estimated_effort: Some(effort.to_string()),
category: Some("maintainability".to_string()),
cwe_id: None,
why_it_matters: Some(if analysis.importer_count > 5 {
"This file is a dependency hub - many other files import from it. \
Large dependency hubs are hard to refactor and create merge conflicts."
.to_string()
} else {
"Large files are harder to understand, test, and maintain. \
They often indicate that the module has too many responsibilities."
.to_string()
}),
threshold_metadata,
..Default::default()
});
}
}
}
findings.sort_by_key(|f| std::cmp::Reverse(f.line_end));
info!(
"LargeFilesDetector found {} findings (graph-aware)",
findings.len()
);
Ok(findings)
}
}
impl crate::detectors::RegisteredDetector for LargeFilesDetector {
fn create(init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
std::sync::Arc::new(Self::with_resolver(init.repo_path, &init.resolver))
}
fn max_tier() -> crate::models::Tier {
crate::models::Tier::Deep
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::detectors::analysis_context::AnalysisContext;
    use crate::graph::builder::GraphBuilder;

    /// Produces `line_count` lines of the form `x_<i> = <i>`.
    fn assignment_lines(line_count: usize) -> String {
        let mut source = String::new();
        for i in 0..line_count {
            source.push_str(&format!("x_{} = {}\n", i, i));
        }
        source
    }

    /// Collects finding titles for use in assertion messages.
    fn titles(findings: &[Finding]) -> Vec<&String> {
        findings.iter().map(|f| &f.title).collect()
    }

    /// A file just over the default threshold must be reported with its line count.
    #[test]
    fn test_detects_large_file() {
        let content = assignment_lines(850);
        let store = GraphBuilder::new().freeze();
        let detector = LargeFilesDetector::new("/mock/repo");
        let ctx =
            AnalysisContext::test_with_mock_files(&store, vec![("big_module.py", &content)]);

        let findings = detector.detect(&ctx).expect("detection should succeed");

        assert!(
            !findings.is_empty(),
            "Should detect file with 850 lines (threshold 800). Found: {:?}",
            titles(&findings)
        );
        assert!(
            findings[0].title.contains("850"),
            "Title should mention line count"
        );
    }

    /// A file well under the threshold must not produce any finding.
    #[test]
    fn test_no_finding_for_small_file() {
        let content = assignment_lines(100);
        let store = GraphBuilder::new().freeze();
        let detector = LargeFilesDetector::new("/mock/repo");
        let ctx =
            AnalysisContext::test_with_mock_files(&store, vec![("small_module.py", &content)]);

        let findings = detector.detect(&ctx).expect("detection should succeed");

        assert!(
            findings.is_empty(),
            "Should not flag file with 100 lines. Found: {:?}",
            titles(&findings)
        );
    }
}