antislop/detector/
mod.rs

1//! Slop detection engine.
2//!
3//! This module provides the core scanning functionality, extracting comments
4//! and matching against slop patterns.
5
6mod patterns;
7mod regex_fallback;
8
9#[cfg(feature = "tree-sitter")]
10mod tree_sitter;
11
12pub use patterns::{CompiledPattern, PatternRegistry};
13pub use regex_fallback::RegexExtractor;
14
15use crate::config::{Pattern, PatternCategory, Severity};
16use crate::Result;
17use std::collections::HashMap;
18use std::path::Path;
19
20/// A comment extracted from source code.
21#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
22pub struct Comment {
23    /// Line number (1-indexed).
24    pub line: usize,
25    /// Column number (1-indexed).
26    pub column: usize,
27    /// The comment text content.
28    pub content: String,
29}
30
31/// A single slop finding.
32#[derive(Debug, Clone, serde::Serialize)]
33pub struct Finding {
34    /// File path.
35    pub file: String,
36    /// Line number (1-indexed).
37    pub line: usize,
38    /// Column number (1-indexed).
39    pub column: usize,
40    /// Severity level.
41    pub severity: Severity,
42    /// Pattern category.
43    pub category: PatternCategory,
44    /// Human-readable message.
45    pub message: String,
46    /// The matched text.
47    pub match_text: String,
48    /// The regex pattern that matched.
49    pub pattern_regex: String,
50}
51
52/// Result of scanning a single file.
53#[derive(Debug, Clone, serde::Serialize)]
54pub struct FileScanResult {
55    /// File path.
56    pub path: String,
57    /// All findings in this file.
58    pub findings: Vec<Finding>,
59    /// Total slop score for this file.
60    pub score: u32,
61}
62
63/// Summary of a scan operation.
64#[derive(Debug, Clone, serde::Serialize)]
65pub struct ScanSummary {
66    /// Number of files scanned.
67    pub files_scanned: usize,
68    /// Number of files with findings.
69    pub files_with_findings: usize,
70    /// Total number of findings.
71    pub total_findings: usize,
72    /// Total slop score across all files.
73    pub total_score: u32,
74    /// Findings grouped by severity.
75    pub by_severity: HashMap<Severity, usize>,
76    /// Findings grouped by category.
77    pub by_category: HashMap<PatternCategory, usize>,
78}
79
80impl ScanSummary {
81    /// Create a summary from scan results.
82    pub fn new(results: &[FileScanResult]) -> Self {
83        let mut summary = Self {
84            files_scanned: results.len(),
85            files_with_findings: 0,
86            total_findings: 0,
87            total_score: 0,
88            by_severity: HashMap::new(),
89            by_category: HashMap::new(),
90        };
91
92        for result in results {
93            if !result.findings.is_empty() {
94                summary.files_with_findings += 1;
95            }
96            summary.total_findings += result.findings.len();
97            summary.total_score += result.score;
98
99            for finding in &result.findings {
100                *summary
101                    .by_severity
102                    .entry(finding.severity.clone())
103                    .or_insert(0) += 1;
104                *summary
105                    .by_category
106                    .entry(finding.category.clone())
107                    .or_insert(0) += 1;
108            }
109        }
110
111        summary
112    }
113}
114
/// Source language of a scanned file, as inferred from its file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Language {
    /// Python source.
    Python,
    /// JavaScript.
    JavaScript,
    /// TypeScript.
    TypeScript,
    /// JSX (React).
    Jsx,
    /// TSX (React TypeScript).
    Tsx,
    /// Rust.
    Rust,
    /// Go.
    Go,
    /// Java.
    Java,
    /// Kotlin.
    Kotlin,
    /// C/C++.
    CCpp,
    /// C#.
    CSharp,
    /// Ruby.
    Ruby,
    /// PHP.
    Php,
    /// Swift.
    Swift,
    /// Haskell.
    Haskell,
    /// Lua.
    Lua,
    /// Perl.
    Perl,
    /// R.
    R,
    /// Scala.
    Scala,
    /// Shell scripts.
    Shell,
    /// Unknown language.
    Unknown,
}

impl Language {
    /// Detect the language from the extension of `path`.
    ///
    /// Matching is case-sensitive; the only upper-case spelling accepted is
    /// `R`. A path with no extension, an extension that is not valid UTF-8,
    /// or an unrecognised extension yields [`Language::Unknown`].
    pub fn from_path(path: &Path) -> Self {
        // An absent or non-UTF-8 extension becomes "", which falls through to
        // the catch-all arm below.
        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");

        match ext {
            "py" => Language::Python,
            "js" | "mjs" | "cjs" => Language::JavaScript,
            "ts" => Language::TypeScript,
            "jsx" => Language::Jsx,
            "tsx" => Language::Tsx,
            "rs" => Language::Rust,
            "go" => Language::Go,
            "java" => Language::Java,
            "kt" | "kts" => Language::Kotlin,
            "c" | "cpp" | "cc" | "cxx" | "h" | "hpp" => Language::CCpp,
            "cs" => Language::CSharp,
            "rb" => Language::Ruby,
            "php" => Language::Php,
            "swift" => Language::Swift,
            "hs" => Language::Haskell,
            "lua" => Language::Lua,
            "pl" | "pm" => Language::Perl,
            "r" | "R" => Language::R,
            "scala" => Language::Scala,
            "sh" | "bash" | "zsh" | "fish" => Language::Shell,
            _ => Language::Unknown,
        }
    }

    /// Returns true if tree-sitter supports this language.
    ///
    /// Support is decided at compile time: a language reports `true` only
    /// when the crate was built with that language's grammar feature.
    /// Languages without a grammar arm below always report `false`.
    // Gated on `tree-sitter` so it cannot collide with the fallback
    // definition below when that feature is disabled.
    #[cfg(feature = "tree-sitter")]
    pub fn has_tree_sitter(self) -> bool {
        match self {
            #[cfg(feature = "python")]
            Language::Python => true,
            #[cfg(feature = "javascript")]
            Language::JavaScript | Language::Jsx => true,
            #[cfg(feature = "typescript")]
            Language::TypeScript | Language::Tsx => true,
            #[cfg(feature = "rust")]
            Language::Rust => true,
            #[cfg(feature = "go")]
            Language::Go => true,
            #[cfg(feature = "java")]
            Language::Java => true,
            #[cfg(feature = "cpp")]
            Language::CCpp => true,
            #[cfg(feature = "c-sharp")]
            Language::CSharp => true,
            #[cfg(feature = "php")]
            Language::Php => true,
            #[cfg(feature = "ruby")]
            Language::Ruby => true,
            #[cfg(feature = "haskell")]
            Language::Haskell => true,
            #[cfg(feature = "lua")]
            Language::Lua => true,
            #[cfg(feature = "scala")]
            Language::Scala => true,
            // Needed because any arm above may be compiled out.
            _ => false,
        }
    }

    /// Returns true if tree-sitter supports this language.
    ///
    /// Without the `tree-sitter` feature no language is supported, so this
    /// is always `false`.
    #[cfg(not(feature = "tree-sitter"))]
    pub fn has_tree_sitter(self) -> bool {
        false
    }
}
232
233/// Comment extractor trait.
234pub trait CommentExtractor {
235    /// Extract all comments from the given source code.
236    fn extract(&self, source: &str) -> Vec<Comment>;
237}
238
239/// The main scanner.
240pub struct Scanner {
241    registry: PatternRegistry,
242}
243
244impl Scanner {
245    /// Create a new scanner with the given patterns.
246    pub fn new(patterns: Vec<Pattern>) -> Result<Self> {
247        let registry = PatternRegistry::new(patterns)?;
248        Ok(Self { registry })
249    }
250
251    /// Scan a single file.
252    pub fn scan_file(&self, path: &str, content: &str) -> FileScanResult {
253        let lang = Language::from_path(Path::new(path));
254        let mut comment_findings = self.findings_from_comments(path, lang, content);
255
256        // Also run AST-level detection if available
257        #[cfg(feature = "tree-sitter")]
258        if lang.has_tree_sitter() {
259            if let Some(mut extractor) = self::tree_sitter::get_extractor(lang) {
260                // Collect pattern references for AST detection
261                let patterns: Vec<&Pattern> =
262                    self.registry.patterns.iter().map(|p| &p.pattern).collect();
263                // Convert Vec<&Pattern> to a slice that lives long enough
264                let pattern_refs: Vec<Pattern> = patterns.iter().map(|p| (**p).clone()).collect();
265                let ast_findings = extractor.extract_ast_findings(content, &pattern_refs);
266
267                // Set file path and add to results
268                for mut finding in ast_findings {
269                    finding.file = path.to_string();
270                    comment_findings.score += finding.severity.score();
271                    comment_findings.findings.push(finding);
272                }
273            }
274        }
275
276        comment_findings
277    }
278
279    /// Extract comments using the best available method.
280    fn extract_comments(&self, lang: Language, source: &str) -> Vec<Comment> {
281        #[cfg(feature = "tree-sitter")]
282        if lang.has_tree_sitter() {
283            if let Some(mut extractor) = self::tree_sitter::get_extractor(lang) {
284                return extractor.extract(source);
285            }
286        }
287
288        // Fallback to regex-based extraction
289        RegexExtractor::new().extract(source)
290    }
291
292    /// Convert comments to findings by matching patterns.
293    fn findings_from_comments(&self, path: &str, lang: Language, source: &str) -> FileScanResult {
294        let mut findings = Vec::new();
295        let mut total_score = 0u32;
296
297        let comments = self.extract_comments(lang, source);
298
299        for comment in &comments {
300            for pattern in &self.registry.patterns {
301                // Skip AST-only patterns for comment-based matching
302                if pattern.pattern.ast_query.is_some() {
303                    continue;
304                }
305
306                if let Some(regex) = &pattern.compiled {
307                    if let Some(mat) = regex.find(&comment.content) {
308                        let severity = pattern.pattern.severity.clone();
309                        total_score += severity.score();
310
311                        findings.push(Finding {
312                            file: path.to_string(),
313                            line: comment.line,
314                            column: comment.column + mat.start(),
315                            severity,
316                            category: pattern.pattern.category.clone(),
317                            message: pattern.pattern.message.clone(),
318                            match_text: mat.as_str().to_string(),
319                            pattern_regex: pattern.pattern.regex.to_string(),
320                        });
321                    }
322                }
323            }
324        }
325
326        FileScanResult {
327            path: path.to_string(),
328            findings,
329            score: total_score,
330        }
331    }
332}
333
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::RegexPattern;

    /// Build a regex-only pattern: no AST query and an empty `languages` list.
    fn regex_pattern(
        regex: &str,
        severity: Severity,
        message: &str,
        category: PatternCategory,
    ) -> Pattern {
        Pattern {
            regex: RegexPattern::new(regex.to_string()).unwrap(),
            severity,
            message: message.to_string(),
            category,
            ast_query: None,
            languages: vec![],
        }
    }

    /// The two patterns shared by the scanner tests.
    fn test_patterns() -> Vec<Pattern> {
        vec![
            regex_pattern(
                "(?i)TODO:",
                Severity::Medium,
                "Placeholder comment found",
                PatternCategory::Placeholder,
            ),
            regex_pattern(
                "(?i)for now",
                Severity::Low,
                "Deferral phrase detected",
                PatternCategory::Deferral,
            ),
        ]
    }

    /// Assert that every file name in `cases` is detected as the paired language.
    fn assert_detected(cases: &[(&str, Language)]) {
        for &(file, expected) in cases {
            assert_eq!(
                Language::from_path(Path::new(file)),
                expected,
                "unexpected language for {}",
                file
            );
        }
    }

    #[test]
    fn test_scan_file_findings() {
        let scanner = Scanner::new(test_patterns()).unwrap();
        let code = r#"
# TODO: implement this later
# This is fine
# for now we'll do it this way
"#;
        let result = scanner.scan_file("test.py", code);

        // Exactly two findings, in source order.
        let categories: Vec<PatternCategory> = result
            .findings
            .iter()
            .map(|f| f.category.clone())
            .collect();
        assert_eq!(
            categories,
            vec![PatternCategory::Placeholder, PatternCategory::Deferral]
        );
    }

    #[test]
    fn test_score_calculation() {
        let scanner = Scanner::new(test_patterns()).unwrap();
        let result = scanner.scan_file("test.py", "# TODO: fix this # for now we do this");
        assert_eq!(result.score, 6);
    }

    #[test]
    fn test_language_detection() {
        assert_detected(&[
            ("test.py", Language::Python),
            ("test.rs", Language::Rust),
            ("test.js", Language::JavaScript),
            ("test.tsx", Language::Tsx),
            ("test.xyz", Language::Unknown),
        ]);
    }

    #[test]
    fn test_language_detection_all_types() {
        // Remaining recognised extensions.
        assert_detected(&[
            ("test.ts", Language::TypeScript),
            ("test.jsx", Language::Jsx),
            ("test.go", Language::Go),
            ("test.java", Language::Java),
            ("test.kt", Language::Kotlin),
            ("test.kts", Language::Kotlin),
            ("test.c", Language::CCpp),
            ("test.cpp", Language::CCpp),
            ("test.cs", Language::CSharp),
            ("test.rb", Language::Ruby),
            ("test.php", Language::Php),
            ("test.swift", Language::Swift),
            ("test.hs", Language::Haskell),
            ("test.lua", Language::Lua),
            ("test.pl", Language::Perl),
            ("test.pm", Language::Perl),
            ("test.scala", Language::Scala),
            ("test.sh", Language::Shell),
            ("test.bash", Language::Shell),
            ("test.zsh", Language::Shell),
            ("test.fish", Language::Shell),
        ]);
    }

    #[test]
    fn test_language_from_path_no_extension() {
        // Paths that have no extension at all.
        assert_detected(&[
            ("Makefile", Language::Unknown),
            (".gitignore", Language::Unknown),
            ("test", Language::Unknown),
        ]);
    }

    #[test]
    fn test_comment_struct() {
        let Comment {
            line,
            column,
            content,
        } = Comment {
            line: 10,
            column: 5,
            content: "TODO: implement this".to_string(),
        };
        assert_eq!(line, 10);
        assert_eq!(column, 5);
        assert_eq!(content, "TODO: implement this");
    }

    #[test]
    fn test_finding_struct() {
        let finding = Finding {
            file: "test.py".to_string(),
            line: 10,
            column: 5,
            severity: Severity::Medium,
            category: PatternCategory::Placeholder,
            message: "TODO comment found".to_string(),
            match_text: "TODO".to_string(),
            pattern_regex: "(?i)todo".to_string(),
        };
        assert_eq!(finding.file, "test.py");
        assert_eq!(finding.line, 10);
        assert_eq!(finding.severity, Severity::Medium);
        assert_eq!(finding.category, PatternCategory::Placeholder);
    }

    #[test]
    fn test_file_scan_result_struct() {
        let result = FileScanResult {
            path: "test.py".to_string(),
            findings: Vec::new(),
            score: 0,
        };
        assert_eq!(result.path, "test.py");
        assert!(result.findings.is_empty());
        assert_eq!(result.score, 0);
    }

    #[test]
    fn test_scan_summary_new_empty() {
        let summary = ScanSummary::new(&[]);
        assert_eq!(summary.files_scanned, 0);
        assert_eq!(summary.files_with_findings, 0);
        assert_eq!(summary.total_findings, 0);
        assert_eq!(summary.total_score, 0);
    }

    #[test]
    fn test_scan_summary_new_with_results() {
        let only_finding = Finding {
            file: "test.py".to_string(),
            line: 1,
            column: 1,
            severity: Severity::Medium,
            category: PatternCategory::Placeholder,
            message: "TODO".to_string(),
            match_text: "TODO".to_string(),
            pattern_regex: "(?i)todo".to_string(),
        };
        let results = vec![FileScanResult {
            path: "test.py".to_string(),
            findings: vec![only_finding],
            score: 5,
        }];

        let summary = ScanSummary::new(&results);
        assert_eq!(summary.files_scanned, 1);
        assert_eq!(summary.files_with_findings, 1);
        assert_eq!(summary.total_findings, 1);
        assert_eq!(summary.total_score, 5);
        assert_eq!(summary.by_severity.get(&Severity::Medium), Some(&1));
        assert_eq!(
            summary.by_category.get(&PatternCategory::Placeholder),
            Some(&1)
        );
    }

    #[test]
    fn test_scan_summary_new_empty_results() {
        // Two scanned files, neither of which has any findings.
        let results: Vec<FileScanResult> = ["clean.py", "sloppy.py"]
            .iter()
            .map(|name| FileScanResult {
                path: name.to_string(),
                findings: vec![],
                score: 0,
            })
            .collect();

        let summary = ScanSummary::new(&results);
        assert_eq!(summary.files_scanned, 2);
        assert_eq!(summary.files_with_findings, 0);
        assert_eq!(summary.total_findings, 0);
        assert_eq!(summary.total_score, 0);
    }
}