// repotoire 0.8.3

// Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
// Documentation
//! Large Files Detector
//!
//! Graph-enhanced detection of overly large files:
//! - Count functions and classes in the file
//! - Analyze coupling (how many other files depend on this)
//! - Suggest split points based on function groupings

use crate::detectors::base::{Detector, DetectorConfig};
use crate::graph::GraphQueryExt;
use crate::models::{deterministic_finding_id, Finding, Severity};
use anyhow::Result;
use std::collections::HashSet;
use std::path::PathBuf;
use tracing::info;

/// Detector that flags source files whose line count exceeds a threshold.
///
/// Built via [`LargeFilesDetector::new`] (fixed defaults) or
/// [`LargeFilesDetector::with_resolver`] (threshold taken from a resolver).
pub struct LargeFilesDetector {
    /// Repository root supplied at construction.
    #[allow(dead_code)] // Part of detector pattern, used for file scanning
    repository_path: PathBuf,
    /// Cap on the number of findings a single `detect` run reports.
    max_findings: usize,
    /// A file is flagged when its line count is strictly greater than this.
    threshold: usize,
    /// Baseline threshold (800 in both constructors); passed to the
    /// resolver's `explain` when building a finding.
    default_threshold: usize,
    /// Resolver used to produce the threshold explanation attached to each finding.
    resolver: crate::calibrate::ThresholdResolver,
}

impl LargeFilesDetector {
    /// Line-count threshold used when the resolver supplies no other value.
    const DEFAULT_THRESHOLD: usize = 800;
    /// Cap on the number of findings reported by one run.
    const DEFAULT_MAX_FINDINGS: usize = 50;

    /// Build a detector with the stock threshold and a default resolver.
    #[allow(dead_code)] // Constructor used by tests and detector registration
    pub fn new(repository_path: impl Into<PathBuf>) -> Self {
        Self {
            repository_path: repository_path.into(),
            max_findings: Self::DEFAULT_MAX_FINDINGS,
            threshold: Self::DEFAULT_THRESHOLD,
            default_threshold: Self::DEFAULT_THRESHOLD,
            resolver: Default::default(),
        }
    }

    /// Build a detector whose threshold comes from `resolver`.
    ///
    /// The resolver is asked for the `FileLength` warn value with the stock
    /// threshold as the fallback argument; a differing result is logged.
    /// A clone of the resolver is kept for explaining findings later.
    pub fn with_resolver(
        repository_path: impl Into<PathBuf>,
        resolver: &crate::calibrate::ThresholdResolver,
    ) -> Self {
        let default_threshold = Self::DEFAULT_THRESHOLD;
        let threshold =
            resolver.warn_usize(crate::calibrate::MetricKind::FileLength, default_threshold);

        if threshold != default_threshold {
            tracing::info!(
                "LargeFiles: adaptive threshold {} (default={})",
                threshold,
                default_threshold
            );
        }

        Self {
            repository_path: repository_path.into(),
            max_findings: Self::DEFAULT_MAX_FINDINGS,
            threshold,
            default_threshold,
            resolver: resolver.clone(),
        }
    }

    /// Collect graph-derived facts about `file_path`.
    ///
    /// Reports the number of functions in the file, how many distinct other
    /// files contain callers of those functions, the largest function, and up
    /// to five function-name prefixes that could seed a split.
    fn analyze_file_structure(
        graph: &dyn crate::graph::GraphQuery,
        file_path: &str,
    ) -> FileAnalysis {
        use std::collections::BTreeSet;

        let i = graph.interner();
        let functions = graph.get_functions_in_file(file_path);

        // Distinct external files holding at least one caller of a function defined here.
        let mut dependent_files: HashSet<String> = HashSet::new();
        for func in &functions {
            for caller in graph.get_callers(func.qn(i)) {
                let caller_file = caller.path(i);
                if caller_file != file_path {
                    dependent_files.insert(caller_file.to_string());
                }
            }
        }

        // Pick the winner first so only one name is turned into a `String`.
        // NOTE(review): size is `line_end - line_start`; if both bounds are
        // inclusive this is one less than the line count — confirm.
        let largest_func = functions
            .iter()
            .max_by_key(|f| f.line_end.saturating_sub(f.line_start))
            .map(|f| {
                (
                    f.node_name(i).to_string(),
                    f.line_end.saturating_sub(f.line_start),
                )
            });

        // Text before the first underscore, kept only when longer than two
        // bytes. A BTreeSet gives a deterministic (sorted) order for output.
        let mut name_prefixes: BTreeSet<String> = BTreeSet::new();
        for func in &functions {
            let name = func.node_name(i);
            if let Some((prefix, _rest)) = name.split_once('_') {
                if prefix.len() > 2 {
                    name_prefixes.insert(prefix.to_string());
                }
            }
        }

        FileAnalysis {
            func_count: functions.len(),
            importer_count: dependent_files.len(),
            largest_func,
            potential_modules: name_prefixes.into_iter().take(5).collect(),
        }
    }
}

/// Graph-derived facts about one file, produced by
/// `LargeFilesDetector::analyze_file_structure`.
struct FileAnalysis {
    /// Number of functions the graph reports for the file.
    func_count: usize,
    /// Number of distinct other file paths that contain callers of this file's functions.
    importer_count: usize,
    /// Name and size (`line_end - line_start`, saturating) of the largest function, if any.
    largest_func: Option<(String, u32)>,
    /// Up to five function-name prefixes (text before the first `_`, longer
    /// than two bytes), in sorted order.
    potential_modules: Vec<String>,
}

impl Detector for LargeFilesDetector {
    fn name(&self) -> &'static str {
        "large-files"
    }
    fn description(&self) -> &'static str {
        "Detects files exceeding size threshold"
    }

    fn file_extensions(&self) -> &'static [&'static str] {
        &[
            "py", "js", "ts", "jsx", "tsx", "rb", "java", "go", "rs", "c", "cpp", "cs",
        ]
    }

    fn detect(
        &self,
        ctx: &crate::detectors::analysis_context::AnalysisContext,
    ) -> Result<Vec<Finding>> {
        let graph = ctx.graph;
        let files = &ctx.as_file_provider();
        let mut findings = vec![];

        for path in files.files_with_extensions(&[
            "py", "js", "ts", "jsx", "tsx", "rs", "go", "java", "cs", "cpp", "c", "h", "rb", "php",
        ]) {
            if findings.len() >= self.max_findings {
                break;
            }

            let path_str = path.to_string_lossy().to_string();

            // Skip vendor/generated
            if path_str.contains("vendor")
                || path_str.contains("node_modules")
                || path_str.contains("generated")
                || path_str.contains(".min.")
            {
                continue;
            }

            if let Some(content) = files.content(path) {
                let lines = content.lines().count();
                if lines > self.threshold {
                    let analysis = Self::analyze_file_structure(graph, &path_str);

                    // Calculate severity based on size and coupling
                    let severity = if lines > 2000 || analysis.importer_count > 10 {
                        Severity::High
                    } else if lines > 1000 || analysis.importer_count > 5 {
                        Severity::Medium
                    } else {
                        Severity::Low
                    };

                    // Build notes
                    let mut notes = Vec::new();
                    notes.push(format!("📏 {} lines", lines));
                    if analysis.func_count > 0 {
                        notes.push(format!("📦 {} functions", analysis.func_count));
                    }
                    if analysis.importer_count > 0 {
                        notes.push(format!(
                            "🔗 {} files depend on this",
                            analysis.importer_count
                        ));
                    }
                    if let Some((name, size)) = &analysis.largest_func {
                        notes.push(format!("📐 Largest function: `{}` ({} lines)", name, size));
                    }

                    let context_notes = format!("\n\n**Analysis:**\n{}", notes.join("\n"));

                    // Build split suggestion
                    let suggestion = if !analysis.potential_modules.is_empty() {
                        format!(
                            "Consider splitting by function prefix into separate modules:\n\n\
                             {}\n\n\
                             ```python\n\
                             # {}_utils.py - extract {}_* functions\n\
                             # {}_core.py - extract core logic\n\
                             ```",
                            analysis
                                .potential_modules
                                .iter()
                                .map(|p| format!("• `{}_*` functions → `{}.py`", p, p))
                                .collect::<Vec<_>>()
                                .join("\n"),
                            analysis
                                .potential_modules
                                .first()
                                .unwrap_or(&"module".to_string()),
                            analysis
                                .potential_modules
                                .first()
                                .unwrap_or(&"module".to_string()),
                            path.file_stem().and_then(|s| s.to_str()).unwrap_or("file")
                        )
                    } else {
                        "Split into smaller, focused modules. Group related functions together."
                            .to_string()
                    };

                    let effort = if lines > 1000 {
                        "2-4 hours"
                    } else {
                        "1-2 hours"
                    };

                    let explanation = self.resolver.explain(
                        crate::calibrate::MetricKind::FileLength,
                        lines as f64,
                        self.default_threshold as f64,
                    );
                    let threshold_metadata = explanation.to_metadata().into_iter().collect();

                    findings.push(Finding {
                        id: String::new(),
                        detector: "LargeFilesDetector".to_string(),
                        severity,
                        title: format!("Large file: {} lines", lines),
                        description: format!(
                            "File exceeds recommended size ({} lines > {} threshold).{}\n\n📊 {}",
                            lines,
                            self.threshold,
                            context_notes,
                            explanation.to_note()
                        ),
                        affected_files: vec![path.to_path_buf()],
                        line_start: Some(1),
                        line_end: Some(lines as u32),
                        suggested_fix: Some(suggestion),
                        estimated_effort: Some(effort.to_string()),
                        category: Some("maintainability".to_string()),
                        cwe_id: None,
                        why_it_matters: Some(if analysis.importer_count > 5 {
                            "This file is a dependency hub - many other files import from it. \
                             Large dependency hubs are hard to refactor and create merge conflicts."
                                .to_string()
                        } else {
                            "Large files are harder to understand, test, and maintain. \
                             They often indicate that the module has too many responsibilities."
                                .to_string()
                        }),
                        threshold_metadata,
                        ..Default::default()
                    });
                }
            }
        }

        // Sort by line count (largest first)
        findings.sort_by_key(|f| std::cmp::Reverse(f.line_end));

        info!(
            "LargeFilesDetector found {} findings (graph-aware)",
            findings.len()
        );
        Ok(findings)
    }
}

impl crate::detectors::RegisteredDetector for LargeFilesDetector {
    /// Registry entry point: build the detector from `init`'s repository path
    /// and threshold resolver, and hand it back as a shared trait object.
    fn create(init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
        let detector = Self::with_resolver(init.repo_path, &init.resolver);
        std::sync::Arc::new(detector)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::graph::builder::GraphBuilder;

    /// Run a default-configured detector over a single mock file consisting of
    /// `line_count` one-line assignments and return whatever it reports.
    fn findings_for(file_name: &'static str, line_count: usize) -> Vec<Finding> {
        let mut content = String::new();
        for i in 0..line_count {
            content.push_str(&format!("x_{} = {}\n", i, i));
        }

        let store = GraphBuilder::new().freeze();
        let detector = LargeFilesDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![(file_name, &content)],
        );
        let findings = detector.detect(&ctx).expect("detection should succeed");
        findings
    }

    #[test]
    fn test_detects_large_file() {
        // Default threshold is 800 lines; 850 is comfortably above it.
        let findings = findings_for("big_module.py", 850);
        let titles = findings.iter().map(|f| &f.title).collect::<Vec<_>>();

        assert!(
            !findings.is_empty(),
            "Should detect file with 850 lines (threshold 800). Found: {:?}",
            titles
        );
        assert!(
            findings[0].title.contains("850"),
            "Title should mention line count"
        );
    }

    #[test]
    fn test_no_finding_for_small_file() {
        // 100 lines is far below the default threshold.
        let findings = findings_for("small_module.py", 100);
        let titles = findings.iter().map(|f| &f.title).collect::<Vec<_>>();

        assert!(
            findings.is_empty(),
            "Should not flag file with 100 lines. Found: {:?}",
            titles
        );
    }
}