// repotoire 0.5.3

//! Base detector trait and types
//!
//! This module defines the core abstractions for code smell detection:
//! - `Detector` trait that all detectors must implement
//! - `DetectorResult` for capturing execution results
//! - Helper types for detector configuration

use crate::models::{Finding, Severity};
use anyhow::Result;
use serde::{Deserialize, Serialize};
/// Derive a stable finding ID from the detector name, file path, and line number (#73).
///
/// A random ID (`Uuid::new_v4()`) differs on every run, which defeats
/// deduplication in the incremental cache; hashing the inputs yields the same
/// ID for the same finding location every time.
///
/// The hash is 64-bit FNV-1a rather than `DefaultHasher`: `DefaultHasher` is
/// explicitly not guaranteed stable across Rust versions, so a toolchain
/// upgrade would silently invalidate the incremental cache.
///
/// The hashed byte stream is `detector`, `0xff`, `file`, `0xff`, then `line`
/// as four little-endian bytes. The result is rendered as 16 zero-padded,
/// lowercase hexadecimal digits.
pub fn finding_id(detector: &str, file: &str, line: u32) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;
    // Field separator: 0xff cannot occur in the UTF-8 bytes of a `&str`, so
    // ("ab", "c") and ("a", "bc") feed different byte streams to the hash.
    const SEPARATOR: [u8; 1] = [0xff];

    let line_bytes = line.to_le_bytes();
    let fields: [&[u8]; 5] = [
        detector.as_bytes(),
        &SEPARATOR,
        file.as_bytes(),
        &SEPARATOR,
        &line_bytes,
    ];

    let digest = fields
        .iter()
        .flat_map(|field| field.iter())
        .fold(FNV_OFFSET_BASIS, |state, &byte| {
            // FNV-1a order: XOR the byte in first, then multiply by the prime.
            (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        });

    format!("{:016x}", digest)
}
use std::collections::HashMap;

/// How much of the codebase a detector must see in order to produce findings.
///
/// The analysis engine uses this to decide which detectors to re-run on
/// incremental updates.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DetectorScope {
    /// Reads file content only and issues no graph queries, so it can run on
    /// a single file in isolation.
    FileLocal,
    /// Uses the graph, but findings are attributed to the entities of specific
    /// files. Can be re-run for just a changed file's entities when the graph
    /// is available.
    FileScopedGraph,
    /// Needs cross-file graph topology (SCC, fan-in/out, call chains).
    /// Must be re-run on the full graph whenever that topology changes.
    GraphWide,
}

/// Outcome of running a single detector: its findings plus execution metadata.
#[derive(Debug, Clone)]
pub struct DetectorResult {
    /// Name of the detector that produced this result.
    pub detector_name: String,
    /// Findings produced by the detector (empty for failed or skipped runs).
    pub findings: Vec<Finding>,
    /// Execution time in milliseconds.
    pub duration_ms: u64,
    /// Whether the detector completed successfully. Skipped runs also report
    /// `true` here.
    pub success: bool,
    /// Error message if the detector failed; `None` otherwise.
    pub error: Option<String>,
}

impl DetectorResult {
    /// Create a successful result
    pub fn success(detector_name: String, findings: Vec<Finding>, duration_ms: u64) -> Self {
        Self {
            detector_name,
            findings,
            duration_ms,
            success: true,
            error: None,
        }
    }

    /// Create a failed result
    pub fn failure(detector_name: String, error: String, duration_ms: u64) -> Self {
        Self {
            detector_name,
            findings: Vec::new(),
            duration_ms,
            success: false,
            error: Some(error),
        }
    }

    /// Create a skipped result (used when early termination limit is reached)
    pub fn skipped(detector_name: &str) -> Self {
        Self {
            detector_name: detector_name.to_string(),
            findings: Vec::new(),
            duration_ms: 0,
            success: true,
            error: None,
        }
    }
}

/// Configuration options for detectors
///
/// `DetectorConfig::default()` and `DetectorConfig::new()` are equivalent:
/// both produce neutral multipliers of `1.0`. (`Default` is implemented by
/// hand because a derived impl would zero the multipliers.)
#[derive(Debug, Clone)]
pub struct DetectorConfig {
    /// Repository ID for multi-tenant filtering
    #[allow(dead_code)] // Multi-tenant support
    pub repo_id: Option<String>,
    /// Maximum findings to return per detector
    pub max_findings: Option<usize>,
    /// Detector-specific thresholds and options
    pub options: HashMap<String, serde_json::Value>,
    /// Coupling threshold multiplier based on project type (1.0 = web/CRUD, higher = more lenient)
    pub coupling_multiplier: f64,
    /// Complexity threshold multiplier based on project type
    pub complexity_multiplier: f64,
    /// Adaptive threshold resolver (from style profile)
    pub adaptive: crate::calibrate::ThresholdResolver,
}

impl Default for DetectorConfig {
    /// Same as [`DetectorConfig::new`]: no overrides and multipliers of `1.0`.
    fn default() -> Self {
        Self::new()
    }
}

impl DetectorConfig {
    /// Create a new config with default values
    ///
    /// No repository ID, no findings cap, no options, both multipliers at
    /// `1.0`, and a default adaptive threshold resolver.
    pub fn new() -> Self {
        Self {
            repo_id: None,
            max_findings: None,
            options: HashMap::new(),
            coupling_multiplier: 1.0,
            complexity_multiplier: 1.0,
            adaptive: crate::calibrate::ThresholdResolver::default(),
        }
    }

    /// Create a config populated from project-level detector thresholds
    ///
    /// Looks up the detector in the project config — first by its normalized
    /// name (GodClassDetector -> god-class), then by the name as given — and
    /// copies any threshold values into the options map as JSON values.
    /// If the detector has no entry, the result equals [`DetectorConfig::new`].
    pub fn from_project_config(
        detector_name: &str,
        project_config: &crate::config::ProjectConfig,
    ) -> Self {
        let mut config = Self::new();

        // Normalize detector name for lookup (GodClassDetector -> god-class)
        let normalized = crate::config::normalize_detector_name(detector_name);

        // Prefer the normalized key; fall back to the raw detector name.
        let detector_override = project_config
            .detectors
            .get(&normalized)
            .or_else(|| project_config.detectors.get(detector_name));

        if let Some(detector_override) = detector_override {
            // Copy thresholds to options
            for (key, value) in &detector_override.thresholds {
                let json_value = match value {
                    crate::config::ThresholdValue::Integer(v) => serde_json::json!(*v),
                    crate::config::ThresholdValue::Float(v) => serde_json::json!(*v),
                    crate::config::ThresholdValue::Boolean(v) => serde_json::json!(*v),
                    crate::config::ThresholdValue::String(v) => serde_json::json!(v),
                };
                config.options.insert(key.clone(), json_value);
            }
        }

        config
    }

    /// Create a config with project type multipliers
    ///
    /// Builds on [`DetectorConfig::from_project_config`], then uses the project
    /// type (auto-detected or explicit) to set the coupling and complexity
    /// multipliers. Interpreters/VMs get more lenient thresholds than web apps.
    pub fn from_project_config_with_type(
        detector_name: &str,
        project_config: &crate::config::ProjectConfig,
        repo_path: &std::path::Path,
    ) -> Self {
        let mut config = Self::from_project_config(detector_name, project_config);
        let project_type = project_config.project_type(repo_path);
        config.coupling_multiplier = project_type.coupling_multiplier();
        config.complexity_multiplier = project_type.complexity_multiplier();
        config
    }

    /// Set adaptive threshold resolver from style profile
    pub fn with_adaptive(mut self, resolver: crate::calibrate::ThresholdResolver) -> Self {
        self.adaptive = resolver;
        self
    }

    /// Set the repository ID
    #[allow(dead_code)] // Builder method for multi-tenant support
    pub fn with_repo_id(mut self, repo_id: impl Into<String>) -> Self {
        self.repo_id = Some(repo_id.into());
        self
    }

    /// Set maximum findings
    #[allow(dead_code)] // Builder method for detector configuration
    pub fn with_max_findings(mut self, max: usize) -> Self {
        self.max_findings = Some(max);
        self
    }

    /// Set a custom option, replacing any existing value stored under `key`
    #[allow(dead_code)] // Builder method for detector configuration
    pub fn with_option(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
        self.options.insert(key.into(), value);
        self
    }

    /// Get a typed option value
    ///
    /// Returns `None` when the key is absent *or* when the stored JSON value
    /// cannot be deserialized into `T`.
    pub fn get_option<T: serde::de::DeserializeOwned>(&self, key: &str) -> Option<T> {
        self.options
            .get(key)
            .and_then(|v| serde_json::from_value(v.clone()).ok())
    }

    /// Get an option with a default value
    ///
    /// Falls back to `default` in the same cases where
    /// [`DetectorConfig::get_option`] returns `None`.
    pub fn get_option_or<T: serde::de::DeserializeOwned>(&self, key: &str, default: T) -> T {
        self.get_option(key).unwrap_or(default)
    }
}

/// Check if a file path is non-production code (scripts, benchmarks, tools, examples, docs).
///
/// Findings on non-production files are downgraded to LOW severity in the
/// post-detection filter, since they don't represent production risk.
/// This is separate from test files (which are filtered out entirely).
///
/// Matching is case-insensitive and works on `/`-separated directory
/// segments: a path qualifies when any directory it passes through (at the
/// repository root or deeper) is one of the known non-production directory
/// names, or when it lies under a `utils` directory and the path contains
/// `.py` (utility scripts). The final path component is treated as the file
/// name and never counts as a directory.
pub fn is_non_production_file(path: &std::path::Path) -> bool {
    const NON_PRODUCTION_DIRS: &[&str] = &[
        "scripts",
        "benchmarks",
        "benchmark",
        "tools",
        "examples",
        "example",
        "docs",
        "doc",
        "contrib",
        "misc",
        "hack",
    ];

    let path_str = path.to_string_lossy().to_lowercase();

    // Everything before the last '/' is the directory part; a bare file name
    // has no directories and therefore cannot match.
    let dir_part = match path_str.rfind('/') {
        Some(idx) => &path_str[..idx],
        None => return false,
    };

    let mut under_utils = false;
    for segment in dir_part.split('/') {
        if NON_PRODUCTION_DIRS.contains(&segment) {
            return true;
        }
        under_utils |= segment == "utils";
    }

    // `utils/` only counts for Python utility scripts.
    under_utils && path_str.contains(".py")
}

/// Decide whether a file path looks like a test file.
///
/// Used by security detectors to avoid flagging test certificates, test
/// fixtures, etc. Matching is case-insensitive and purely textual: the path
/// qualifies when its file name starts with `test_`, when the whole path ends
/// in a known test/spec suffix, when it contains a known test, spec, or
/// fixture directory marker, or when it starts with `tests/` or `test/`.
pub fn is_test_file(path: &std::path::Path) -> bool {
    // Go, Python, Ruby, and JS/TS test-file suffixes.
    const TEST_SUFFIXES: &[&str] = &[
        "_test.go",
        "_test.py",
        "_spec.rb",
        ".test.ts",
        ".test.js",
        ".test.tsx",
        ".test.jsx",
        ".spec.ts",
        ".spec.js",
        ".spec.tsx",
        ".spec.jsx",
    ];
    // Test, spec, and fixture/data directories anywhere below the root.
    const TEST_DIR_MARKERS: &[&str] = &[
        "/tests/",
        "/test/",
        "/__tests__/",
        "/e2e/",
        "/spec/",
        "/fixtures/",
        "/testdata/",
        "/__fixtures__/",
        "/__mocks__/",
    ];
    // Test directories at the start of a relative path.
    const LEADING_TEST_DIRS: &[&str] = &["tests/", "test/"];

    let full = path.to_string_lossy().to_lowercase();
    let base_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    // e.g. test_foo.py
    if base_name.starts_with("test_") {
        return true;
    }

    TEST_SUFFIXES.iter().any(|suffix| full.ends_with(*suffix))
        || TEST_DIR_MARKERS.iter().any(|marker| full.contains(*marker))
        || LEADING_TEST_DIRS.iter().any(|dir| full.starts_with(*dir))
}

/// Decide whether a path string looks like a test/spec path, using
/// path-segment and file-name-affix matching.
///
/// Unlike a plain `contains("test")`, this does not match words such as
/// 'attestation' or 'contest' (#30). Matching is case-insensitive.
pub fn is_test_path(path_str: &str) -> bool {
    // Markers that may appear anywhere in the path.
    const MARKERS: &[&str] = &[
        "/test/",
        "/tests/",
        "/__tests__/",
        "/spec/",
        "/test_",
        "_test.",
        ".test.",
        ".spec.",
        "_spec.",
    ];
    // Test directories at the start of a relative path.
    const LEADING_DIRS: &[&str] = &["tests/", "test/", "__tests__/", "spec/"];

    let lower = path_str.to_lowercase();

    if MARKERS.iter().any(|marker| lower.contains(*marker)) {
        return true;
    }
    LEADING_DIRS.iter().any(|dir| lower.starts_with(*dir))
}

/// Common interface implemented by every code smell detector.
///
/// Detectors analyze the code graph to find issues such as:
/// - Circular dependencies
/// - God classes (classes that do too much)
/// - Long parameter lists
/// - Dead code
/// - And more...
///
/// Only `name`, `description`, and `detect` are required; every other method
/// has a default that implementors override as needed.
///
/// # Example Implementation
///
/// ```ignore
/// pub struct MyDetector {
///     config: DetectorConfig,
/// }
///
/// impl Detector for MyDetector {
///     fn name(&self) -> &'static str {
///         "MyDetector"
///     }
///
///     fn description(&self) -> &'static str {
///         "Detects my specific code smell"
///     }
///
///     fn detect(&self, ctx: &super::analysis_context::AnalysisContext) -> Result<Vec<Finding>> {
///         let graph = ctx.graph;
///         // Query the graph and analyze results
///         Ok(vec![])
///     }
/// }
/// ```
pub trait Detector: Send + Sync {
    /// Unique identifier of this detector.
    ///
    /// Should match the Python detector name for consistency
    /// (e.g., "CircularDependencyDetector").
    fn name(&self) -> &'static str;

    /// Human-readable description of what this detector finds.
    fn description(&self) -> &'static str;

    /// Run detection and return the findings.
    ///
    /// This is the main entry point for detection. Implementations should:
    /// 1. Query the graph store for relevant data
    /// 2. Analyze the data to find issues
    /// 3. Return a list of findings with appropriate severity
    ///
    /// # Arguments
    /// * `ctx` - Unified analysis context containing graph, files, function contexts, taint, etc.
    ///
    /// # Errors
    /// Returns an error if detection fails.
    fn detect(&self, ctx: &super::analysis_context::AnalysisContext) -> Result<Vec<Finding>>;

    /// Whether this detector depends on results from other detectors.
    ///
    /// Dependent detectors run sequentially after all independent detectors
    /// have completed, which lets them use findings from other detectors.
    ///
    /// Default: `false` (independent).
    fn is_dependent(&self) -> bool {
        false
    }

    /// Names of the detectors that must run before this one.
    ///
    /// Only meaningful if `is_dependent()` returns true.
    ///
    /// Default: no dependencies.
    #[allow(dead_code)] // Reserved for future dependent detector support
    fn dependencies(&self) -> Vec<&'static str> {
        Vec::new()
    }

    /// Category of issues this detector finds.
    ///
    /// Used for grouping and filtering findings in reports.
    ///
    /// Default: `"code_smell"`.
    fn category(&self) -> &'static str {
        "code_smell"
    }

    /// Configuration of this detector, if it carries one.
    ///
    /// Default: `None`.
    fn config(&self) -> Option<&DetectorConfig> {
        None
    }

    /// Scope of this detector, which determines when it needs to re-run.
    ///
    /// - `FileLocal`: Only analyzes individual files, can be cached per-file
    /// - `FileScopedGraph`: Uses graph but findings are per-file entity
    /// - `GraphWide`: Uses full graph topology, re-run if graph structure changes
    ///
    /// Default: `GraphWide` (conservative - always re-runs).
    ///
    /// NOTE(review): this overlaps with `detector_scope()`, and the two
    /// defaults disagree (`GraphWide` here vs. `FileScopedGraph` there when
    /// `requires_graph()` is true) — confirm which one the engine consults.
    fn scope(&self) -> DetectorScope {
        DetectorScope::GraphWide
    }

    /// Scope of this detector for incremental analysis.
    ///
    /// - `FileLocal`: re-run only on changed files
    /// - `FileScopedGraph`: re-run for changed files' entities
    /// - `GraphWide`: re-run if graph topology changes
    ///
    /// Default: derived from `requires_graph()` — `FileScopedGraph` when the
    /// graph is required, `FileLocal` otherwise. The default never yields
    /// `GraphWide`.
    fn detector_scope(&self) -> DetectorScope {
        if self.requires_graph() {
            return DetectorScope::FileScopedGraph;
        }
        DetectorScope::FileLocal
    }

    /// Whether this detector requires the full code graph to be built.
    ///
    /// Detectors that only analyze file content (magic numbers, deep nesting,
    /// security patterns, etc.) can return `false` to run speculatively
    /// in parallel with graph building.
    ///
    /// Default: `true` (conservative — waits for graph completion).
    fn requires_graph(&self) -> bool {
        true
    }

    /// Inject pre-computed taint analysis results into this detector.
    ///
    /// Called by the engine before `detect()` for security detectors that
    /// use taint analysis. The `cross` paths come from BFS `trace_taint()`
    /// and the `intra` paths from file-based heuristic analysis.
    ///
    /// Security detectors override this to store results in an `OnceLock`,
    /// then check for pre-computed results in their `detect()` method.
    ///
    /// Default: no-op; both arguments are dropped (non-taint detectors ignore this).
    fn set_precomputed_taint(
        &self,
        _cross: Vec<super::taint::TaintPath>,
        _intra: Vec<super::taint::TaintPath>,
    ) {
        // Intentionally empty.
    }

    /// Taint category this detector uses, if any.
    ///
    /// Used by the engine to dispatch the correct pre-computed taint results.
    /// Only security detectors that use taint analysis need to override this.
    ///
    /// Default: `None`.
    fn taint_category(&self) -> Option<super::taint::TaintCategory> {
        None
    }

    /// File extensions this detector processes.
    ///
    /// Return an empty slice for graph-only detectors that don't scan files.
    /// The engine uses this to pre-filter files before calling `detect()`.
    ///
    /// Default: empty slice.
    fn file_extensions(&self) -> &'static [&'static str] {
        &[]
    }

    /// Content flags required for files this detector processes.
    ///
    /// Files without ANY of these flags are skipped.
    /// Return `ContentFlags::empty()` to receive all files (no filtering).
    ///
    /// Default: `ContentFlags::empty()`.
    fn content_requirements(&self) -> super::detector_context::ContentFlags {
        super::detector_context::ContentFlags::empty()
    }

    /// Whether this detector produces mathematically deterministic results.
    ///
    /// Deterministic detectors use provable graph algorithms (dominator trees,
    /// SCCs, articulation points) rather than heuristics. Their findings should
    /// NOT be filtered by statistical FP classifiers (GBDT) since the results
    /// are mathematically certain, not probabilistic.
    ///
    /// Default: `false` (most detectors are heuristic-based).
    fn is_deterministic(&self) -> bool {
        false
    }

    /// Whether this detector makes network calls (e.g., API requests).
    ///
    /// Network-bound detectors are skipped in incremental mode and their
    /// cached findings are carried forward instead.
    ///
    /// Default: `false`.
    fn is_network_bound(&self) -> bool {
        false
    }

    /// Whether this detector's findings should bypass GBDT postprocessor filtering.
    ///
    /// High-precision pattern-based detectors should return true.
    ///
    /// Default: `false`.
    fn bypass_postprocessor(&self) -> bool {
        false
    }
}

/// Progress callback for detector execution.
///
/// A boxed, thread-safe (`Send + Sync`) closure taking a string and two counts.
/// NOTE(review): the arguments are presumably (detector name, completed count,
/// total count) — confirm against the call site.
pub type ProgressCallback = Box<dyn Fn(&str, usize, usize) + Send + Sync>;

/// Summary statistics accumulated over a run of detectors.
///
/// Starts out all-zero/empty via `Default` and is filled in one
/// `DetectorResult` at a time through `add_result`.
#[derive(Debug, Clone, Default)]
pub struct DetectionSummary {
    /// Total number of detectors run (successful and failed)
    pub detectors_run: usize,
    /// Number of detectors that succeeded
    pub detectors_succeeded: usize,
    /// Number of detectors that failed
    pub detectors_failed: usize,
    /// Total findings across all successful detectors
    pub total_findings: usize,
    /// Number of findings per severity level
    pub by_severity: HashMap<Severity, usize>,
    /// Sum of the individual detector execution times, in milliseconds
    pub total_duration_ms: u64,
}

impl DetectionSummary {
    /// Fold one detector result into the running totals.
    ///
    /// Every result counts toward `detectors_run` and `total_duration_ms`.
    /// Findings and per-severity counts are only taken from successful
    /// results; a failed result just bumps `detectors_failed`.
    pub fn add_result(&mut self, result: &DetectorResult) {
        self.detectors_run += 1;
        self.total_duration_ms += result.duration_ms;

        if !result.success {
            self.detectors_failed += 1;
            return;
        }

        self.detectors_succeeded += 1;
        self.total_findings += result.findings.len();

        for severity in result.findings.iter().map(|finding| finding.severity) {
            *self.by_severity.entry(severity).or_default() += 1;
        }
    }
}

/// Pre-compile glob patterns from exclude list into regexes
///
/// Only patterns containing `*` are compiled; all other patterns are left to
/// the literal matching in `should_exclude_path`. `*` is the only wildcard and
/// matches any run of characters (including `/`). Every other character is
/// matched literally — in particular the `.` in `*.py` matches only a real
/// dot, so `*.py` does not match a file named `happy`. Each regex is anchored
/// at both ends, so it must match the whole candidate string.
pub fn compile_glob_patterns(patterns: &[String]) -> Vec<regex::Regex> {
    patterns
        .iter()
        .filter(|p| p.contains('*'))
        .filter_map(|p| {
            // Escape the literal pieces between wildcards so regex
            // metacharacters in the glob (`.`, `+`, `(`, `[`, ...) are not
            // interpreted, then rejoin the pieces with `.*`.
            let body = p
                .split('*')
                .map(regex::escape)
                .collect::<Vec<_>>()
                .join(".*");
            regex::Regex::new(&format!("^{}$", body)).ok()
        })
        .collect()
}

/// Check if a path should be excluded based on patterns and pre-compiled globs
///
/// Each entry of `patterns` is interpreted as follows:
/// - an empty pattern is ignored (it would otherwise be a substring of every
///   path and exclude everything);
/// - a pattern ending in `/` names a directory and matches when any
///   `/`-separated segment of `path` equals that name (a pattern made only of
///   slashes names no directory and is ignored);
/// - a pattern containing `*` is skipped here and handled via `compiled_globs`;
/// - any other pattern matches when it is a substring of `path`.
///
/// `compiled_globs` (see `compile_glob_patterns`) are then tried against both
/// the full path and its file name.
pub fn should_exclude_path(
    path: &str,
    patterns: &[String],
    compiled_globs: &[regex::Regex],
) -> bool {
    for pattern in patterns {
        if pattern.is_empty() {
            continue;
        }

        if pattern.ends_with('/') {
            let dir = pattern.trim_end_matches('/');
            if !dir.is_empty() && path.split('/').any(|segment| segment == dir) {
                return true;
            }
        } else if !pattern.contains('*') && path.contains(pattern.as_str()) {
            // Glob patterns (containing '*') are handled by compiled_globs below.
            return true;
        }
    }

    let filename = std::path::Path::new(path)
        .file_name()
        .and_then(|s| s.to_str())
        .unwrap_or("");

    compiled_globs
        .iter()
        .any(|re| re.is_match(path) || re.is_match(filename))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builder methods set their fields and typed option lookup works.
    #[test]
    fn test_detector_config() {
        let config = DetectorConfig::new()
            .with_repo_id("test-repo")
            .with_max_findings(100)
            .with_option("threshold", serde_json::json!(10));

        assert_eq!(config.repo_id, Some("test-repo".to_string()));
        assert_eq!(config.max_findings, Some(100));
        assert_eq!(config.get_option::<i32>("threshold"), Some(10));
        assert_eq!(config.get_option_or("missing", 5), 5);
    }

    #[test]
    fn test_detector_result_success() {
        let result = DetectorResult::success("TestDetector".to_string(), vec![], 100);
        assert!(result.success);
        assert!(result.error.is_none());
        assert_eq!(result.duration_ms, 100);
    }

    #[test]
    fn test_detector_result_failure() {
        let result = DetectorResult::failure("TestDetector".to_string(), "oops".to_string(), 50);
        assert!(!result.success);
        assert_eq!(result.error, Some("oops".to_string()));
    }

    /// A skipped detector is reported as an empty, zero-duration success.
    #[test]
    fn test_detector_result_skipped() {
        let result = DetectorResult::skipped("TestDetector");
        assert_eq!(result.detector_name, "TestDetector");
        assert!(result.success);
        assert!(result.error.is_none());
        assert!(result.findings.is_empty());
        assert_eq!(result.duration_ms, 0);
    }

    #[test]
    fn test_detection_summary() {
        let mut summary = DetectionSummary::default();

        let result1 = DetectorResult::success("D1".to_string(), vec![], 100);
        let result2 = DetectorResult::failure("D2".to_string(), "err".to_string(), 50);

        summary.add_result(&result1);
        summary.add_result(&result2);

        assert_eq!(summary.detectors_run, 2);
        assert_eq!(summary.detectors_succeeded, 1);
        assert_eq!(summary.detectors_failed, 1);
        assert_eq!(summary.total_duration_ms, 150);
    }

    /// IDs are fixed-width hex, repeatable, and sensitive to every field.
    #[test]
    fn test_finding_id_is_stable() {
        let id = finding_id("GodClassDetector", "src/lib.rs", 42);
        assert_eq!(id.len(), 16);
        assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
        assert_eq!(id, finding_id("GodClassDetector", "src/lib.rs", 42));
        assert_ne!(id, finding_id("GodClassDetector", "src/lib.rs", 43));
        // The separator keeps the detector/file boundary significant.
        assert_ne!(finding_id("ab", "c", 1), finding_id("a", "bc", 1));
    }

    #[test]
    fn test_is_test_file() {
        use std::path::Path;

        assert!(is_test_file(Path::new("foo_test.go")));
        assert!(is_test_file(Path::new("test_foo.py")));
        assert!(is_test_file(Path::new("src/tests/helper.py")));
        assert!(is_test_file(Path::new("app.spec.ts")));
        assert!(!is_test_file(Path::new("src/main.py")));
        assert!(!is_test_file(Path::new("testing_utils.py"))); // "testing" != "test"
    }

    /// Segment matching must not fire on words that merely contain "test" (#30).
    #[test]
    fn test_is_test_path() {
        assert!(is_test_path("src/tests/helper.py"));
        assert!(is_test_path("tests/integration.rs"));
        assert!(is_test_path("web/app.spec.ts"));
        assert!(!is_test_path("src/attestation/verify.rs"));
        assert!(!is_test_path("src/contest.rs"));
    }

    /// Guards against detectors losing their `requires_graph() == false`
    /// annotation, which would stop them running in parallel with graph building.
    #[test]
    fn test_requires_graph_annotation_coverage() {
        let init = crate::detectors::DetectorInit::test_default();
        let detectors = crate::detectors::create_all_detectors(&init);

        let graph_independent: Vec<_> = detectors
            .iter()
            .filter(|d| !d.requires_graph())
            .map(|d| d.name())
            .collect();

        let graph_dependent: Vec<_> = detectors
            .iter()
            .filter(|d| d.requires_graph())
            .map(|d| d.name())
            .collect();

        println!(
            "Graph-independent detectors ({}): {:?}",
            graph_independent.len(),
            graph_independent
        );
        println!(
            "Graph-dependent detectors ({}): {:?}",
            graph_dependent.len(),
            graph_dependent
        );

        // At minimum 34 detectors should be graph-independent
        assert!(
            graph_independent.len() >= 34,
            "Expected >= 34 graph-independent detectors, got {}",
            graph_independent.len()
        );
    }
}