Skip to main content

garbage_code_hunter/
analyzer.rs

1use regex::Regex;
2use std::fs;
3use std::path::{Path, PathBuf};
4use walkdir::WalkDir;
5
6use crate::context::{FileContext, ProjectConfig};
7use crate::language::{Language, SUPPORTED_EXTENSIONS};
8use crate::rules::generic::GenericRuleEngine;
9use crate::treesitter::duplication::{CrossFileDupDetector, IntraFileDupDetector};
10use crate::treesitter::{TreeSitterEngine, TreeSitterRuleEngine};
11
12#[derive(Debug, Clone)]
13pub struct CodeIssue {
14    pub file_path: PathBuf,
15    pub line: usize,
16    pub column: usize,
17    pub rule_name: String,
18    pub message: String,
19    pub severity: Severity,
20}
21
/// Severity level attached to a [`CodeIssue`].
///
/// The derived `Ord` follows declaration order, so
/// `Severity::Mild < Severity::Spicy < Severity::Nuclear`.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Severity {
    /// Minor issues.
    Mild,
    /// Medium issues.
    Spicy,
    /// Serious issues.
    Nuclear,
}
28
29pub struct CodeAnalyzer {
30    generic_engine: GenericRuleEngine,
31    ts_engine: TreeSitterEngine,
32    ts_rule_engine: TreeSitterRuleEngine,
33    exclude_patterns: Vec<Regex>,
34    lang: String,
35}
36
37impl CodeAnalyzer {
38    pub fn rule_names(&self) -> Vec<&'static str> {
39        self.ts_rule_engine.rule_names()
40    }
41
42    pub fn new(exclude_patterns: &[String], lang: &str) -> Self {
43        Self::with_config(exclude_patterns, lang, ProjectConfig::default())
44    }
45
46    pub fn with_config(exclude_patterns: &[String], lang: &str, config: ProjectConfig) -> Self {
47        // Default exclude patterns for common build/dependency directories
48        let default_excludes = [
49            "target",
50            "node_modules",
51            ".git",
52            ".svn",
53            ".hg",
54            "build",
55            "dist",
56            "out",
57            "__pycache__",
58            ".DS_Store",
59        ];
60
61        let mut all_patterns: Vec<String> =
62            default_excludes.iter().map(|s| s.to_string()).collect();
63        all_patterns.extend(exclude_patterns.iter().cloned());
64
65        // Also add exclude patterns from project config
66        all_patterns.extend(config.whitelists.exclude_patterns.clone());
67
68        let patterns = all_patterns
69            .iter()
70            .filter_map(|pattern| {
71                // Convert glob patterns to regular expressions
72                let regex_pattern = pattern
73                    .replace(".", r"\.")
74                    .replace("*", ".*")
75                    .replace("?", ".");
76                Regex::new(&regex_pattern).ok()
77            })
78            .collect();
79
80        let mut ts_rule_engine = TreeSitterRuleEngine::new();
81        crate::treesitter::rules::rust_rules::register_rust_rules(&mut ts_rule_engine);
82
83        Self {
84            generic_engine: GenericRuleEngine::new(),
85            ts_engine: TreeSitterEngine::new(),
86            ts_rule_engine,
87            exclude_patterns: patterns,
88            lang: lang.to_string(),
89        }
90    }
91
92    fn should_exclude(&self, path: &Path) -> bool {
93        let path_str = path.to_string_lossy();
94        self.exclude_patterns
95            .iter()
96            .any(|pattern| pattern.is_match(&path_str))
97    }
98
99    pub fn analyze_path(&self, path: &Path) -> Vec<CodeIssue> {
100        if path.is_file() {
101            if !self.should_exclude(path) {
102                let lang = Language::from_path(path);
103                if lang != Language::Unknown {
104                    return self.analyze_file(path);
105                }
106            }
107            return Vec::new();
108        }
109
110        if !path.is_dir() {
111            return Vec::new();
112        }
113
114        // Collect all supported source files
115        let files: Vec<PathBuf> = WalkDir::new(path)
116            .into_iter()
117            .filter_map(|e| e.ok())
118            .filter(|e| !self.should_exclude(e.path()))
119            .filter(|e| {
120                e.path()
121                    .extension()
122                    .and_then(|ext| ext.to_str())
123                    .is_some_and(|ext| SUPPORTED_EXTENSIONS.contains(&ext))
124            })
125            .map(|e| e.path().to_path_buf())
126            .collect();
127
128        // Phase 1: Parallel single-file analysis for all languages
129        let mut issues: Vec<CodeIssue> = files
130            .iter()
131            .flat_map(|file_path| self.analyze_file(file_path))
132            .collect();
133
134        // Phase 2: Cross-file duplication detection (tree-sitter based)
135        let mut cross_detector = CrossFileDupDetector::new();
136        for file_path in &files {
137            if let Ok(content) = fs::read_to_string(file_path) {
138                if let Some(parsed) = self.ts_engine.parse_file(file_path, &content) {
139                    cross_detector.process_file(&parsed);
140                }
141            }
142        }
143        issues.extend(cross_detector.find_duplicates());
144        issues.extend(cross_detector.find_near_duplicates());
145
146        // Phase 3: Intra-file code duplication
147        for file_path in &files {
148            if let Ok(content) = fs::read_to_string(file_path) {
149                if let Some(parsed) = self.ts_engine.parse_file(file_path, &content) {
150                    issues.extend(IntraFileDupDetector::check(&parsed));
151                }
152            }
153        }
154
155        issues
156    }
157
158    pub fn analyze_file(&self, file_path: &Path) -> Vec<CodeIssue> {
159        let content = match fs::read_to_string(file_path) {
160            Ok(content) => content,
161            Err(_) => return vec![],
162        };
163
164        let lang = Language::from_path(file_path);
165        let is_test_file = Self::is_test_file(file_path, &content);
166
167        // Use tree-sitter for all languages with grammar support
168        if let Some(parsed) = self.ts_engine.parse_file(file_path, &content) {
169            let context = FileContext::from_path(file_path);
170            self.ts_rule_engine.check_file_with_context(
171                &parsed,
172                is_test_file,
173                &context,
174                &ProjectConfig::default(),
175            )
176        } else if lang == Language::C || lang == Language::Cpp {
177            // Fallback to generic text-based rules for C/C++
178            self.generic_engine
179                .check_file(file_path, &content, &self.lang)
180        } else {
181            vec![]
182        }
183    }
184
185    fn is_test_file(path: &Path, content: &str) -> bool {
186        let path_str = path.to_string_lossy();
187        // Normalize: strip leading "./" for consistent matching
188        let normalized = path_str.strip_prefix("./").unwrap_or(&path_str);
189
190        // Check file path patterns (Rust + C/C++)
191        if normalized.contains("/tests/")
192            || normalized.contains("\\tests\\")
193            || normalized.starts_with("tests/")
194            || normalized.starts_with("tests\\")
195            || normalized.contains("/test/")
196            || normalized.contains("\\test\\")
197            || normalized.ends_with("_test.rs")
198            || normalized.ends_with("_tests.rs")
199            || normalized.ends_with("_test.c")
200            || normalized.ends_with("_test.cpp")
201            || normalized.ends_with("_test.cc")
202            || normalized.starts_with("test_")
203        {
204            return true;
205        }
206        // Check for example files (singular and plural)
207        if normalized.contains("/examples/")
208            || normalized.contains("\\examples\\")
209            || normalized.starts_with("examples/")
210            || normalized.starts_with("examples\\")
211            || normalized.contains("/example/")
212            || normalized.contains("\\example\\")
213            || normalized.starts_with("example/")
214            || normalized.starts_with("example\\")
215            || normalized.ends_with("_example.rs")
216            || normalized.ends_with("_examples.rs")
217        {
218            return true;
219        }
220        // Check for benchmark files
221        if normalized.contains("/benches/")
222            || normalized.contains("\\benches\\")
223            || normalized.starts_with("benches/")
224            || normalized.starts_with("benches\\")
225            || normalized.ends_with("_bench.rs")
226            || normalized.ends_with("_benches.rs")
227        {
228            return true;
229        }
230        // Check for test-files directories
231        if normalized.contains("/test-files/")
232            || normalized.contains("\\test-files\\")
233            || normalized.starts_with("test-files/")
234            || normalized.starts_with("test-files\\")
235            || normalized.contains("/test_files/")
236            || normalized.contains("\\test_files\\")
237        {
238            return true;
239        }
240        // Check for fixture/mock directories
241        if normalized.contains("/fixtures/")
242            || normalized.contains("\\fixtures\\")
243            || normalized.contains("/mocks/")
244            || normalized.contains("\\mocks\\")
245        {
246            return true;
247        }
248        // Check for #[cfg(test)] module in content (Rust)
249        content.contains("#[cfg(test)]")
250    }
251}