Skip to main content

repotoire/detectors/
base.rs

1//! Base detector trait and types
2//!
3//! This module defines the core abstractions for code smell detection:
4//! - `Detector` trait that all detectors must implement
5//! - `DetectorResult` for capturing execution results
6//! - Helper types for detector configuration
7
8use crate::models::{Finding, Severity};
9use anyhow::Result;
10use serde::{Deserialize, Serialize};
/// Derive a stable finding ID from the detector name, file path, and line number (#73).
///
/// The incremental cache deduplicates findings by ID, so the ID has to be the same
/// on every run; a random `Uuid::new_v4()` would defeat that.
///
/// The digest is 64-bit FNV-1a computed by hand rather than via `DefaultHasher`,
/// whose output is explicitly not guaranteed stable across Rust versions and would
/// silently invalidate the cache after a toolchain upgrade.
///
/// Hashed byte stream: `detector`, `0xff`, `file`, `0xff`, `line` (little-endian).
/// `0xff` never appears in valid UTF-8, so it cleanly separates the two strings.
///
/// Returns the digest as 16 lowercase, zero-padded hex characters.
pub fn finding_id(detector: &str, file: &str, line: u32) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
    const FIELD_SEPARATOR: u8 = 0xff;

    let line_bytes = line.to_le_bytes();

    let digest = detector
        .bytes()
        .chain(std::iter::once(FIELD_SEPARATOR))
        .chain(file.bytes())
        .chain(std::iter::once(FIELD_SEPARATOR))
        .chain(line_bytes.iter().copied())
        // FNV-1a: xor the byte in first, then multiply by the prime.
        .fold(FNV_OFFSET_BASIS, |acc, byte| {
            (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        });

    format!("{:016x}", digest)
}
32use std::collections::HashMap;
33
34/// Describes how much of the codebase a detector needs to produce findings.
35/// Used by the analysis engine to decide which detectors to re-run on incremental updates.
36#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
37pub enum DetectorScope {
38    /// Only reads file content. No graph queries. Can run on a single file in isolation.
39    FileLocal,
40    /// Uses graph but findings are attributed to specific files' entities.
41    /// Can re-run for just the changed file's entities if graph is available.
42    FileScopedGraph,
43    /// Needs cross-file graph topology (SCC, fan-in/out, call chains).
44    /// Must re-run on full graph if topology changes.
45    GraphWide,
46}
47
48/// Result from running a single detector
49#[derive(Debug, Clone)]
50pub struct DetectorResult {
51    /// Name of the detector that produced these results
52    pub detector_name: String,
53    /// Findings produced by the detector
54    pub findings: Vec<Finding>,
55    /// Execution time in milliseconds
56    pub duration_ms: u64,
57    /// Whether the detector completed successfully
58    pub success: bool,
59    /// Error message if the detector failed
60    pub error: Option<String>,
61}
62
63impl DetectorResult {
64    /// Create a successful result
65    pub fn success(detector_name: String, findings: Vec<Finding>, duration_ms: u64) -> Self {
66        Self {
67            detector_name,
68            findings,
69            duration_ms,
70            success: true,
71            error: None,
72        }
73    }
74
75    /// Create a failed result
76    pub fn failure(detector_name: String, error: String, duration_ms: u64) -> Self {
77        Self {
78            detector_name,
79            findings: Vec::new(),
80            duration_ms,
81            success: false,
82            error: Some(error),
83        }
84    }
85
86    /// Create a skipped result (used when early termination limit is reached)
87    pub fn skipped(detector_name: &str) -> Self {
88        Self {
89            detector_name: detector_name.to_string(),
90            findings: Vec::new(),
91            duration_ms: 0,
92            success: true,
93            error: None,
94        }
95    }
96}
97
98/// Configuration options for detectors
99#[derive(Debug, Clone, Default)]
100pub struct DetectorConfig {
101    /// Repository ID for multi-tenant filtering
102    #[allow(dead_code)] // Multi-tenant support
103    pub repo_id: Option<String>,
104    /// Maximum findings to return per detector
105    pub max_findings: Option<usize>,
106    /// Detector-specific thresholds and options
107    pub options: HashMap<String, serde_json::Value>,
108    /// Coupling threshold multiplier based on project type (1.0 = web/CRUD, higher = more lenient)
109    pub coupling_multiplier: f64,
110    /// Complexity threshold multiplier based on project type
111    pub complexity_multiplier: f64,
112    /// Adaptive threshold resolver (from style profile)
113    pub adaptive: crate::calibrate::ThresholdResolver,
114}
115
116impl DetectorConfig {
117    /// Create a new config with default values
118    pub fn new() -> Self {
119        Self {
120            repo_id: None,
121            max_findings: None,
122            options: HashMap::new(),
123            coupling_multiplier: 1.0,
124            complexity_multiplier: 1.0,
125            adaptive: crate::calibrate::ThresholdResolver::default(),
126        }
127    }
128
129    /// Create a config populated from project-level detector thresholds
130    ///
131    /// Looks up the detector by name in the project config and copies
132    /// any threshold values into the options map.
133    pub fn from_project_config(
134        detector_name: &str,
135        project_config: &crate::config::ProjectConfig,
136    ) -> Self {
137        let mut config = Self::new();
138
139        // Normalize detector name for lookup (GodClassDetector -> god-class)
140        let normalized = crate::config::normalize_detector_name(detector_name);
141
142        // Look up detector config in project config
143        if let Some(detector_override) = project_config
144            .detectors
145            .get(&normalized)
146            .or_else(|| project_config.detectors.get(detector_name))
147        {
148            // Copy thresholds to options
149            for (key, value) in &detector_override.thresholds {
150                let json_value = match value {
151                    crate::config::ThresholdValue::Integer(v) => serde_json::json!(*v),
152                    crate::config::ThresholdValue::Float(v) => serde_json::json!(*v),
153                    crate::config::ThresholdValue::Boolean(v) => serde_json::json!(*v),
154                    crate::config::ThresholdValue::String(v) => serde_json::json!(v),
155                };
156                config.options.insert(key.clone(), json_value);
157            }
158        }
159
160        config
161    }
162
163    /// Create a config with project type multipliers
164    ///
165    /// Uses the project type (auto-detected or explicit) to set coupling and complexity
166    /// multipliers. Interpreters/VMs get more lenient thresholds than web apps.
167    pub fn from_project_config_with_type(
168        detector_name: &str,
169        project_config: &crate::config::ProjectConfig,
170        repo_path: &std::path::Path,
171    ) -> Self {
172        let mut config = Self::from_project_config(detector_name, project_config);
173        let project_type = project_config.project_type(repo_path);
174        config.coupling_multiplier = project_type.coupling_multiplier();
175        config.complexity_multiplier = project_type.complexity_multiplier();
176        config
177    }
178
179    /// Set adaptive threshold resolver from style profile
180    pub fn with_adaptive(mut self, resolver: crate::calibrate::ThresholdResolver) -> Self {
181        self.adaptive = resolver;
182        self
183    }
184
185    /// Set the repository ID
186    #[allow(dead_code)] // Builder method for multi-tenant support
187    pub fn with_repo_id(mut self, repo_id: impl Into<String>) -> Self {
188        self.repo_id = Some(repo_id.into());
189        self
190    }
191
192    /// Set maximum findings
193    #[allow(dead_code)] // Builder method for detector configuration
194    pub fn with_max_findings(mut self, max: usize) -> Self {
195        self.max_findings = Some(max);
196        self
197    }
198
199    /// Set a custom option
200    #[allow(dead_code)] // Builder method for detector configuration
201    pub fn with_option(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
202        self.options.insert(key.into(), value);
203        self
204    }
205
206    /// Get a typed option value
207    pub fn get_option<T: serde::de::DeserializeOwned>(&self, key: &str) -> Option<T> {
208        self.options
209            .get(key)
210            .and_then(|v| serde_json::from_value(v.clone()).ok())
211    }
212
213    /// Get an option with a default value
214    pub fn get_option_or<T: serde::de::DeserializeOwned>(&self, key: &str, default: T) -> T {
215        self.get_option(key).unwrap_or(default)
216    }
217}
218
/// Report whether `path` points at non-production code (scripts, benchmarks,
/// tools, examples, docs and similar).
///
/// Findings on such files are downgraded to LOW severity by the post-detection
/// filter because they carry no production risk. Test files are a separate
/// category: those are filtered out entirely.
///
/// Matching is case-insensitive and purely textual: the lowercased path is
/// checked for known directory markers anywhere in the path, for a `/utils/`
/// directory combined with `.py` (utility scripts), and for a small set of
/// well-known directories at the very start of a relative path.
pub fn is_non_production_file(path: &std::path::Path) -> bool {
    /// Directory markers that qualify wherever they occur in the path.
    const DIR_MARKERS: &[&str] = &[
        "/scripts/",
        "/benchmarks/",
        "/benchmark/",
        "/tools/",
        "/examples/",
        "/example/",
        "/docs/",
        "/doc/",
        "/contrib/",
        "/misc/",
        "/hack/",
    ];
    /// Directories that qualify when a relative path starts with them.
    const LEADING_DIRS: &[&str] = &["scripts/", "benchmarks/", "tools/", "examples/", "docs/"];

    let lowered = path.to_string_lossy().to_lowercase();

    if DIR_MARKERS.iter().any(|&marker| lowered.contains(marker)) {
        return true;
    }

    // Utility scripts: a `/utils/` directory only counts for Python paths.
    if lowered.contains("/utils/") && lowered.contains(".py") {
        return true;
    }

    LEADING_DIRS.iter().any(|&dir| lowered.starts_with(dir))
}
244
/// Report whether `path` looks like a test file.
///
/// Security detectors use this to avoid flagging test certificates, test
/// fixtures, and the like.
///
/// Matching is case-insensitive. A path qualifies when any of these hold:
/// - its file name starts with `test_` (e.g. `test_foo.py`);
/// - it ends with a known test/spec suffix (Go, Python, Ruby, JS/TS);
/// - it contains a known test, spec, fixture or mock directory;
/// - it is a relative path starting with `tests/`, `test/` or `__tests__/`.
pub fn is_test_file(path: &std::path::Path) -> bool {
    /// Go/Python test files plus Ruby/JS/TS spec and test files.
    const NAME_SUFFIXES: &[&str] = &[
        "_test.go",
        "_test.py",
        "_spec.rb",
        ".test.ts",
        ".test.js",
        ".test.tsx",
        ".test.jsx",
        ".spec.ts",
        ".spec.js",
        ".spec.tsx",
        ".spec.jsx",
    ];
    /// Test, spec, fixture and mock directories, matched anywhere in the path.
    const DIR_MARKERS: &[&str] = &[
        "/tests/",
        "/test/",
        "/__tests__/",
        "/e2e/",
        "/spec/",
        "/fixtures/",
        "/testdata/",
        "/__fixtures__/",
        "/__mocks__/",
    ];
    /// Test directories recognised at the start of a relative path.
    const LEADING_DIRS: &[&str] = &["tests/", "test/", "__tests__/"];

    let lowered = path.to_string_lossy().to_lowercase();
    // A missing or non-UTF-8 file name is treated as empty.
    let base_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    base_name.starts_with("test_")
        || NAME_SUFFIXES.iter().any(|&suffix| lowered.ends_with(suffix))
        || DIR_MARKERS.iter().any(|&marker| lowered.contains(marker))
        || LEADING_DIRS.iter().any(|&dir| lowered.starts_with(dir))
}
285
/// Report whether a path string looks like a test/spec path, using
/// path-segment and file-name-fragment matching.
///
/// A bare `contains("test")` would also hit 'attestation', 'contest', etc.;
/// anchoring on separators and name delimiters avoids that (#30).
/// Matching is case-insensitive.
pub fn is_test_path(path_str: &str) -> bool {
    /// Directory segments and file-name fragments matched anywhere in the path.
    const FRAGMENTS: &[&str] = &[
        "/test/",
        "/tests/",
        "/__tests__/",
        "/spec/",
        "/test_",
        "_test.",
        ".test.",
        ".spec.",
        "_spec.",
    ];
    /// Test directories at the start of a relative path.
    const LEADING_DIRS: &[&str] = &["tests/", "test/", "__tests__/", "spec/"];

    let lowered = path_str.to_lowercase();

    FRAGMENTS.iter().any(|&fragment| lowered.contains(fragment))
        || LEADING_DIRS.iter().any(|&dir| lowered.starts_with(dir))
}
305
306/// Trait for all code smell detectors
307///
308/// Detectors analyze the code graph to find issues like:
309/// - Circular dependencies
310/// - God classes (classes that do too much)
311/// - Long parameter lists
312/// - Dead code
313/// - And more...
314///
315/// # Example Implementation
316///
317/// ```ignore
318/// pub struct MyDetector {
319///     config: DetectorConfig,
320/// }
321///
322/// impl Detector for MyDetector {
323///     fn name(&self) -> &'static str {
324///         "MyDetector"
325///     }
326///
327///     fn description(&self) -> &'static str {
328///         "Detects my specific code smell"
329///     }
330///
331///     fn detect(&self, ctx: &super::analysis_context::AnalysisContext) -> Result<Vec<Finding>> {
332///         let graph = ctx.graph;
333///         // Query the graph and analyze results
334///         Ok(vec![])
335///     }
336/// }
337/// ```
338pub trait Detector: Send + Sync {
339    /// Unique identifier for this detector
340    ///
341    /// Should match the Python detector name for consistency
342    /// (e.g., "CircularDependencyDetector")
343    fn name(&self) -> &'static str;
344
345    /// Human-readable description of what this detector finds
346    fn description(&self) -> &'static str;
347
348    /// Run detection and return findings
349    ///
350    /// This is the main entry point for detection. Implementations should:
351    /// 1. Query the graph store for relevant data
352    /// 2. Analyze the data to find issues
353    /// 3. Return a list of findings with appropriate severity
354    ///
355    /// # Arguments
356    /// * `ctx` - Unified analysis context containing graph, files, function contexts, taint, etc.
357    ///
358    /// # Returns
359    /// A list of findings, or an error if detection fails
360    fn detect(&self, ctx: &super::analysis_context::AnalysisContext) -> Result<Vec<Finding>>;
361
362    /// Whether this detector depends on results from other detectors
363    ///
364    /// Dependent detectors run sequentially after all independent detectors
365    /// have completed. This allows them to use findings from other detectors.
366    ///
367    /// Default: `false` (independent)
368    fn is_dependent(&self) -> bool {
369        false
370    }
371
372    /// Optional: Dependencies on other detectors
373    ///
374    /// Only meaningful if `is_dependent()` returns true.
375    /// Returns names of detectors that must run before this one.
376    #[allow(dead_code)] // Reserved for future dependent detector support
377    fn dependencies(&self) -> Vec<&'static str> {
378        vec![]
379    }
380
381    /// Category of issues this detector finds
382    ///
383    /// Used for grouping and filtering findings in reports.
384    fn category(&self) -> &'static str {
385        "code_smell"
386    }
387
388    /// Get the configuration for this detector
389    fn config(&self) -> Option<&DetectorConfig> {
390        None
391    }
392
393    /// Scope of this detector - determines when it needs to re-run
394    ///
395    /// - `FileLocal`: Only analyzes individual files, can be cached per-file
396    /// - `FileScopedGraph`: Uses graph but findings are per-file entity
397    /// - `GraphWide`: Uses full graph topology, re-run if graph structure changes
398    ///
399    /// Default is GraphWide (conservative - always re-runs)
400    fn scope(&self) -> DetectorScope {
401        DetectorScope::GraphWide
402    }
403
404    /// Returns the scope of this detector for incremental analysis.
405    /// FileLocal: re-run only on changed files
406    /// FileScopedGraph: re-run for changed files' entities
407    /// GraphWide: re-run if graph topology changes
408    fn detector_scope(&self) -> DetectorScope {
409        if self.requires_graph() {
410            DetectorScope::FileScopedGraph
411        } else {
412            DetectorScope::FileLocal
413        }
414    }
415
416    /// Whether this detector requires the full code graph to be built.
417    ///
418    /// Detectors that only analyze file content (magic numbers, deep nesting,
419    /// security patterns, etc.) can return `false` to run speculatively
420    /// in parallel with graph building.
421    ///
422    /// Default: `true` (conservative — waits for graph completion)
423    fn requires_graph(&self) -> bool {
424        true
425    }
426
427    /// Inject pre-computed taint analysis results into this detector.
428    ///
429    /// Called by the engine before `detect()` for security detectors that
430    /// use taint analysis. The `cross` paths come from BFS `trace_taint()`
431    /// and the `intra` paths from file-based heuristic analysis.
432    ///
433    /// Security detectors override this to store results in an `OnceLock`,
434    /// then check for pre-computed results in their `detect()` method.
435    ///
436    /// Default: no-op (non-taint detectors ignore this).
437    fn set_precomputed_taint(
438        &self,
439        _cross: Vec<super::taint::TaintPath>,
440        _intra: Vec<super::taint::TaintPath>,
441    ) {
442        // Default: no-op
443    }
444
445    /// Return the taint category this detector uses, if any.
446    ///
447    /// Used by the engine to dispatch the correct pre-computed taint results.
448    /// Only security detectors that use taint analysis need to override this.
449    fn taint_category(&self) -> Option<super::taint::TaintCategory> {
450        None
451    }
452
453    /// File extensions this detector processes.
454    ///
455    /// Return empty slice for graph-only detectors that don't scan files.
456    /// Engine uses this to pre-filter files before calling detect().
457    fn file_extensions(&self) -> &'static [&'static str] {
458        &[]
459    }
460
461    /// Content flags required for files this detector processes.
462    ///
463    /// Files without ANY of these flags are skipped.
464    /// Return `ContentFlags::empty()` to receive all files (no filtering).
465    fn content_requirements(&self) -> super::detector_context::ContentFlags {
466        super::detector_context::ContentFlags::empty()
467    }
468
469    /// Whether this detector produces mathematically deterministic results.
470    ///
471    /// Deterministic detectors use provable graph algorithms (dominator trees,
472    /// SCCs, articulation points) rather than heuristics. Their findings should
473    /// NOT be filtered by statistical FP classifiers (GBDT) since the results
474    /// are mathematically certain, not probabilistic.
475    ///
476    /// Default: false (most detectors are heuristic-based).
477    fn is_deterministic(&self) -> bool {
478        false
479    }
480
481    /// Whether this detector makes network calls (e.g., API requests).
482    /// Network-bound detectors are skipped in incremental mode and their
483    /// cached findings are carried forward instead.
484    fn is_network_bound(&self) -> bool {
485        false
486    }
487
488    /// Whether this detector's findings should bypass GBDT postprocessor filtering.
489    /// High-precision pattern-based detectors should return true.
490    fn bypass_postprocessor(&self) -> bool {
491        false
492    }
493}
494
/// Boxed callback used to report progress while detectors execute.
///
/// The closure must be `Send + Sync`, so it can be shared across threads.
/// NOTE(review): the arguments are presumably (detector name, completed count,
/// total count) — confirm against the engine call site.
pub type ProgressCallback = Box<dyn Fn(&str, usize, usize) + Send + Sync>;
497
498/// Summary statistics from running all detectors
499#[derive(Debug, Clone, Default)]
500pub struct DetectionSummary {
501    /// Total number of detectors run
502    pub detectors_run: usize,
503    /// Number of detectors that succeeded
504    pub detectors_succeeded: usize,
505    /// Number of detectors that failed
506    pub detectors_failed: usize,
507    /// Total findings across all detectors
508    pub total_findings: usize,
509    /// Findings by severity
510    pub by_severity: HashMap<Severity, usize>,
511    /// Total execution time in milliseconds
512    pub total_duration_ms: u64,
513}
514
515impl DetectionSummary {
516    /// Update summary with a detector result
517    pub fn add_result(&mut self, result: &DetectorResult) {
518        self.detectors_run += 1;
519        self.total_duration_ms += result.duration_ms;
520
521        if result.success {
522            self.detectors_succeeded += 1;
523            self.total_findings += result.findings.len();
524
525            for finding in &result.findings {
526                *self.by_severity.entry(finding.severity).or_insert(0) += 1;
527            }
528        } else {
529            self.detectors_failed += 1;
530        }
531    }
532}
533
534/// Pre-compile glob patterns from exclude list into regexes
535pub fn compile_glob_patterns(patterns: &[String]) -> Vec<regex::Regex> {
536    patterns
537        .iter()
538        .filter(|p| p.contains('*'))
539        .filter_map(|p| {
540            let re_str = format!("^{}$", p.replace('*', ".*"));
541            regex::Regex::new(&re_str).ok()
542        })
543        .collect()
544}
545
546/// Check if a path should be excluded based on patterns and pre-compiled globs
547pub fn should_exclude_path(
548    path: &str,
549    patterns: &[String],
550    compiled_globs: &[regex::Regex],
551) -> bool {
552    for pattern in patterns {
553        if pattern.ends_with('/') {
554            let dir = pattern.trim_end_matches('/');
555            if path.split('/').any(|p| p == dir) {
556                return true;
557            }
558        } else if pattern.contains('*') {
559            continue; // handled by compiled_globs below
560        } else if path.contains(pattern) {
561            return true;
562        }
563    }
564    let filename = std::path::Path::new(path)
565        .file_name()
566        .and_then(|s| s.to_str())
567        .unwrap_or("");
568    for re in compiled_globs {
569        if re.is_match(path) || re.is_match(filename) {
570            return true;
571        }
572    }
573    false
574}
575
#[cfg(test)]
mod tests {
    use super::*;

    /// Builder methods store their values and typed option lookup works.
    #[test]
    fn test_detector_config() {
        let config = DetectorConfig::new()
            .with_repo_id("test-repo")
            .with_max_findings(100)
            .with_option("threshold", serde_json::json!(10));

        assert_eq!(config.repo_id, Some("test-repo".to_string()));
        assert_eq!(config.max_findings, Some(100));
        assert_eq!(config.get_option::<i32>("threshold"), Some(10));
        // A missing key falls back to the supplied default.
        assert_eq!(config.get_option_or("missing", 5), 5);
    }

    /// A successful result carries no error and keeps its duration.
    #[test]
    fn test_detector_result_success() {
        let outcome = DetectorResult::success("TestDetector".to_string(), vec![], 100);
        assert!(outcome.success);
        assert!(outcome.error.is_none());
        assert_eq!(outcome.duration_ms, 100);
    }

    /// A failed result is flagged as such and keeps the error message.
    #[test]
    fn test_detector_result_failure() {
        let outcome = DetectorResult::failure("TestDetector".to_string(), "oops".to_string(), 50);
        assert!(!outcome.success);
        assert_eq!(outcome.error, Some("oops".to_string()));
    }

    /// The summary counts successes and failures and sums the durations.
    #[test]
    fn test_detection_summary() {
        let passed = DetectorResult::success("D1".to_string(), vec![], 100);
        let failed = DetectorResult::failure("D2".to_string(), "err".to_string(), 50);

        let mut summary = DetectionSummary::default();
        summary.add_result(&passed);
        summary.add_result(&failed);

        assert_eq!(summary.detectors_run, 2);
        assert_eq!(summary.detectors_succeeded, 1);
        assert_eq!(summary.detectors_failed, 1);
        assert_eq!(summary.total_duration_ms, 150);
    }

    /// Spot checks for the test-file heuristics.
    #[test]
    fn test_is_test_file() {
        use std::path::Path;

        assert!(is_test_file(Path::new("foo_test.go")));
        assert!(is_test_file(Path::new("test_foo.py")));
        assert!(is_test_file(Path::new("src/tests/helper.py")));
        assert!(is_test_file(Path::new("app.spec.ts")));
        assert!(!is_test_file(Path::new("src/main.py")));
        assert!(!is_test_file(Path::new("testing_utils.py"))); // "testing" != "test"
    }

    /// Guards the number of detectors annotated as graph-independent.
    #[test]
    fn test_requires_graph_annotation_coverage() {
        let _tmp = tempfile::tempdir().expect("create tempdir");
        let init = crate::detectors::DetectorInit::test_default();
        let detectors = crate::detectors::create_all_detectors(&init);

        // Split detector names by whether they wait for the graph.
        let mut graph_independent = Vec::new();
        let mut graph_dependent = Vec::new();
        for detector in detectors.iter() {
            if detector.requires_graph() {
                graph_dependent.push(detector.name());
            } else {
                graph_independent.push(detector.name());
            }
        }

        println!(
            "Graph-independent detectors ({}): {:?}",
            graph_independent.len(),
            graph_independent
        );
        println!(
            "Graph-dependent detectors ({}): {:?}",
            graph_dependent.len(),
            graph_dependent
        );

        // At minimum 34 detectors should be graph-independent
        assert!(
            graph_independent.len() >= 34,
            "Expected >= 34 graph-independent detectors, got {}",
            graph_independent.len()
        );
    }
}