Skip to main content

garbage_code_hunter/rules/
duplication.rs

1use std::collections::HashMap;
2use std::path::Path;
3use std::sync::OnceLock;
4
5use regex::Regex;
6use syn::{visit::Visit, Block, File};
7
8use crate::analyzer::{CodeIssue, Severity};
9use crate::rules::Rule;
10use crate::utils::get_position;
11
12/// Static regex for string literal stripping, compiled once for performance.
13static STRING_LITERAL_REGEX: OnceLock<Regex> = OnceLock::new();
14
15/// Raw pattern strings for common Rust patterns (not compiled)
16static RUST_COMMON_PATTERN_STRINGS: &[&str] = &[
17    // Struct initialization patterns
18    r"self\.\w+\.push\(\w+::\{",
19    r"\w+\s*\{",
20    r"file_path:\s*self\.\w+\.clone\(\)",
21    r"rule_name:\s*.*\.to_string\(\)",
22    r"message:\s*messages\[",
23    r"severity:\s*Severity::",
24    // Common method chains
25    r"\.clone\(\)",
26    r"\.to_string\(\)",
27    r"\.to_lowercase\(\)",
28    r"\.len\(\)",
29    r"\.is_empty\(\)",
30    r"\.unwrap\(\)",
31    r"\.expect\(",
32    // Common control flow
33    r"if\s+.*\s*\{",
34    r"for\s+.*\s+in\s+",
35    r"match\s+.*\s*\{",
36    r"let\s+.*=.*;",
37    // Common collection operations
38    r"\.push\(",
39    r"\.insert\(",
40    r"\.get\(",
41    r".*get_or_insert",
42    r"\.entry\(",
43    // Visitor pattern (very common in Rust analyzers)
44    r"fn\s+visit_\w+",
45    r"syn::visit::visit_\w+",
46];
47
48/// Pre-compiled regex patterns using OnceLock for performance
49static COMPILED_RUST_PATTERNS: OnceLock<Vec<Regex>> = OnceLock::new();
50
51/// Warning message for skipped patterns (set once at initialization)
52static PATTERN_WARNING: OnceLock<Option<String>> = OnceLock::new();
53
54fn get_compiled_rust_patterns() -> &'static [Regex] {
55    COMPILED_RUST_PATTERNS.get_or_init(|| {
56        let mut compiled = Vec::with_capacity(RUST_COMMON_PATTERN_STRINGS.len());
57        let mut errors = Vec::new();
58        let total = RUST_COMMON_PATTERN_STRINGS.len();
59
60        for (index, pattern) in RUST_COMMON_PATTERN_STRINGS.iter().enumerate() {
61            match Regex::new(pattern) {
62                Ok(regex) => compiled.push(regex),
63                Err(e) => {
64                    let error_msg = format!(
65                        "[{}] Invalid regex pattern at index {}: '{}'\n       Error: {}",
66                        file!(),
67                        index,
68                        pattern,
69                        e
70                    );
71                    eprintln!("⚠️  WARNING: {}", error_msg);
72                    errors.push(error_msg);
73                }
74            }
75
76            let _ = index;
77        }
78
79        if !errors.is_empty() {
80            let warning = if compiled.is_empty() {
81                format!(
82                    "🚨 CRITICAL: All {} regex patterns failed to compile!\n\
83                     Code duplication detection is DISABLED.\n\
84                     Errors:\n{}",
85                    total,
86                    errors.join("\n")
87                )
88            } else {
89                format!(
90                    "⚠️  WARNING: {}/{} regex patterns failed to compile.\n\
91                     Code duplication detection will use remaining {} patterns.\n\
92                     Failed patterns:\n{}",
93                    errors.len(),
94                    total,
95                    compiled.len(),
96                    errors
97                        .iter()
98                        .map(|e| e.lines().next().unwrap_or("").to_string())
99                        .collect::<Vec<_>>()
100                        .join(", ")
101                )
102            };
103
104            eprintln!("\n{}\n", warning);
105
106            let _ = PATTERN_WARNING.set(Some(warning));
107        }
108
109        if compiled.is_empty() && !RUST_COMMON_PATTERN_STRINGS.is_empty() {
110            eprintln!(
111                "🚨 Falling back to empty pattern list. \
112                 Code-duplication rule will have reduced detection capability.\n"
113            );
114        }
115
116        compiled
117    })
118}
119
120fn get_pattern_warning() -> Option<&'static str> {
121    PATTERN_WARNING.get_or_init(|| None).as_deref()
122}
123
124/// code duplication detection rule with smart anti-false-positive logic
125pub struct CodeDuplicationRule;
126
127impl Rule for CodeDuplicationRule {
128    fn name(&self) -> &'static str {
129        "code-duplication"
130    }
131
132    fn check(
133        &self,
134        file_path: &Path,
135        syntax_tree: &File,
136        content: &str,
137        lang: &str,
138        is_test_file: bool,
139    ) -> Vec<CodeIssue> {
140        if is_test_file {
141            return Vec::new();
142        }
143
144        if let Some(warning) = get_pattern_warning() {
145            eprintln!("\n⚠️  [code-duplication] {}\n", warning);
146        }
147
148        let mut visitor = DuplicationVisitor::new(file_path.to_path_buf(), content, lang);
149        visitor.visit_file(syntax_tree);
150        visitor.find_duplications()
151    }
152}
153
154struct DuplicationVisitor {
155    file_path: std::path::PathBuf,
156    content: String,
157    code_blocks: Vec<(String, usize)>,
158    line_hashes: HashMap<String, Vec<usize>>,
159    lang: String,
160}
161
162impl DuplicationVisitor {
163    fn new(file_path: std::path::PathBuf, content: &str, lang: &str) -> Self {
164        Self {
165            file_path,
166            content: content.to_string(),
167            code_blocks: Vec::new(),
168            line_hashes: HashMap::new(),
169            lang: lang.to_string(),
170        }
171    }
172
173    fn find_duplications(&mut self) -> Vec<CodeIssue> {
174        let mut issues = Vec::new();
175
176        // detect LINE-LEVEL duplications (with smart filtering)
177        self.detect_line_duplications(&mut issues);
178
179        // detect BLOCK-LEVEL duplications (multi-line copy-paste)
180        self.detect_block_duplications(&mut issues);
181
182        // detect CONSECUTIVE duplications (the real copy-paste)
183        self.detect_consecutive_duplications(&mut issues);
184
185        issues
186    }
187
188    fn detect_line_duplications(&mut self, issues: &mut Vec<CodeIssue>) {
189        let lines: Vec<&str> = self.content.lines().collect();
190
191        for (line_num, line) in lines.iter().enumerate() {
192            let trimmed = line.trim();
193
194            // ignore empty lines, comments, and simple statements
195            if trimmed.is_empty()
196                || trimmed.starts_with("//")
197                || trimmed.starts_with("/*")
198                || trimmed.starts_with("*")
199                || trimmed.len() < 15
200                || is_simple_statement(trimmed)
201            {
202                continue;
203            }
204
205            // Skip lines that match common Rust patterns (anti-false-positive)
206            if is_common_rust_pattern(trimmed) {
207                continue;
208            }
209
210            // Skip lines that are inside string literals
211            if is_string_literal_line(trimmed) {
212                continue;
213            }
214
215            // Skip struct initialization patterns (very common in Rust)
216            if is_struct_initialization(trimmed) {
217                continue;
218            }
219
220            let normalized = normalize_line_smart(trimmed);
221            if normalized.len() < 10 {
222                continue;
223            }
224
225            self.line_hashes
226                .entry(normalized)
227                .or_default()
228                .push(line_num + 1);
229        }
230
231        // find duplicate lines with HIGHER threshold to reduce false positives
232        for line_numbers in self.line_hashes.values() {
233            let count = line_numbers.len();
234
235            // Increased threshold: need at least 25 repetitions (was 10)
236            // This filters out common patterns like struct initialization
237            if count >= 25 {
238                let messages = self.generate_dup_messages(count);
239
240                let severity = if count >= 40 {
241                    Severity::Nuclear
242                } else if count >= 30 {
243                    Severity::Spicy
244                } else {
245                    Severity::Mild
246                };
247
248                issues.push(CodeIssue {
249                    file_path: self.file_path.clone(),
250                    line: line_numbers[0],
251                    column: 1,
252                    rule_name: "code-duplication".to_string(),
253                    message: messages[issues.len() % messages.len()].clone(),
254                    severity,
255                });
256
257                // Only report top 3 instances to avoid spam
258                if issues.len() >= 3 {
259                    break;
260                }
261            }
262        }
263    }
264
265    fn detect_block_duplications(&self, issues: &mut Vec<CodeIssue>) {
266        let mut block_signatures: HashMap<String, Vec<usize>> = HashMap::new();
267
268        for (i, (block_str, _line)) in self.code_blocks.iter().enumerate() {
269            if block_str.len() > 500 {
270                let signature = generate_block_signature_smart(block_str);
271                block_signatures.entry(signature).or_default().push(i);
272            }
273        }
274
275        for (_, block_indices) in block_signatures {
276            if block_indices.len() >= 8 {
277                let messages = if self.lang == "zh-CN" {
278                    vec![
279                        format!("发现 {} 个相似代码块,考虑重构成函数", block_indices.len()),
280                        "代码块重复度过高,DRY原则哭了".to_string(),
281                        format!("检测到 {} 个相似代码块,重构时间到了", block_indices.len()),
282                    ]
283                } else {
284                    vec![
285                        format!(
286                            "Similar code blocks detected: {} instances",
287                            block_indices.len()
288                        ),
289                        format!(
290                            "Refactoring opportunity: {} similar blocks found",
291                            block_indices.len()
292                        ),
293                        "Code block duplication too high, DRY principle is crying".to_string(),
294                    ]
295                };
296
297                let line = self.code_blocks[block_indices[0]].1;
298
299                issues.push(CodeIssue {
300                    file_path: self.file_path.clone(),
301                    line,
302                    column: 1,
303                    rule_name: "code-duplication".to_string(),
304                    message: messages[issues.len() % messages.len()].clone(),
305                    severity: Severity::Spicy,
306                });
307            }
308        }
309    }
310
311    /// Detect consecutive duplicate lines (REAL copy-paste detection)
312    /// This finds actual copy-pasted code blocks, not just similar patterns
313    fn detect_consecutive_duplications(&self, issues: &mut Vec<CodeIssue>) {
314        let lines: Vec<&str> = self.content.lines().collect();
315        let mut i = 0;
316
317        while i < lines.len().saturating_sub(3) {
318            let current = normalize_line_smart(lines[i].trim());
319
320            if current.is_empty() || current.len() < 15 {
321                i += 1;
322                continue;
323            }
324
325            // Look ahead for consecutive identical patterns
326            let mut dup_count = 1;
327            let mut start_line = i + 1;
328
329            while start_line < lines.len() && dup_count < 5 {
330                let next_normalized = normalize_line_smart(lines[start_line].trim());
331                if next_normalized == current && !is_common_rust_pattern(lines[start_line].trim()) {
332                    dup_count += 1;
333                    start_line += 1;
334                } else {
335                    break;
336                }
337            }
338
339            // Report only if we found 4+ consecutive identical lines
340            if dup_count >= 4 {
341                let messages = if self.lang == "zh-CN" {
342                    vec![
343                        format!("发现连续 {} 行完全相同的代码!这是复制粘贴!", dup_count),
344                        format!("{} 行重复代码块,建议提取为函数或宏", dup_count),
345                    ]
346                } else {
347                    vec![
348                        format!(
349                            "Found {} consecutive identical lines! This looks like copy-paste!",
350                            dup_count
351                        ),
352                        format!(
353                            "{} line duplicate block detected - consider extracting to function/macro",
354                            dup_count
355                        ),
356                    ]
357                };
358
359                issues.push(CodeIssue {
360                    file_path: self.file_path.clone(),
361                    line: i + 1,
362                    column: 1,
363                    rule_name: "code-duplication".to_string(),
364                    message: messages[0].clone(),
365                    severity: Severity::Spicy,
366                });
367
368                i = start_line; // Skip past this block
369            } else {
370                i += 1;
371            }
372        }
373    }
374
375    fn generate_dup_messages(&self, count: usize) -> Vec<String> {
376        if self.lang == "zh-CN" {
377            vec![
378                format!("检测到 {} 次重复代码!你是复制粘贴大师吗?", count),
379                format!("这行代码重复了 {} 次,建议提取成函数", count),
380                format!("重复代码警报!{} 次重复让维护变成噩梦", count),
381                format!("复制粘贴忍者出现!{} 行相同代码", count),
382                format!("违反 DRY 原则:{} 行重复代码", count),
383            ]
384        } else {
385            vec![
386                format!("Copy-paste ninja detected! {} identical lines found", count),
387                format!("DRY principle violation: {} duplicated lines", count),
388                format!("Code duplication alert! {} repetitions found", count),
389                format!(
390                    "This line repeated {} times - consider extracting to function",
391                    count
392                ),
393                format!("Maintenance nightmare: {} duplicate lines detected", count),
394            ]
395        }
396    }
397}
398
399impl<'ast> Visit<'ast> for DuplicationVisitor {
400    fn visit_block(&mut self, block: &'ast Block) {
401        let block_str = format!("{block:?}");
402        if block_str.len() > 50 {
403            let (line, _) = get_position(block);
404            self.code_blocks.push((block_str, line));
405        }
406        syn::visit::visit_block(self, block);
407    }
408}
409
410/// Smart normalization that preserves semantic differences
411fn normalize_line_smart(line: &str) -> String {
412    let re = STRING_LITERAL_REGEX.get_or_init(|| Regex::new(r#""[^"]*""#).unwrap());
413
414    let stripped = re.replace_all(line.trim(), "STR");
415
416    stripped.replace(char::is_whitespace, "").to_lowercase()
417}
418
419/// Check if a line matches common Rust patterns that should be ignored
420fn is_common_rust_pattern(line: &str) -> bool {
421    let trimmed = line.trim();
422
423    for pattern in get_compiled_rust_patterns().iter() {
424        if pattern.is_match(trimmed) {
425            return true;
426        }
427    }
428
429    false
430}
431
432/// Check if this line is a struct initialization pattern
433fn is_struct_initialization(line: &str) -> bool {
434    let trimmed = line.trim();
435
436    // Pattern: SomeStruct { field: value, ... }
437    if trimmed.contains('{') && trimmed.contains('}') {
438        // Count the number of fields being set
439        let field_count = trimmed.matches(':').count();
440
441        // If it has multiple fields (>= 3), it's likely a struct init
442        if field_count >= 3 {
443            return true;
444        }
445    }
446
447    // Pattern: self.issues.push(CodeIssue { ... })
448    if trimmed.contains(".push(") && trimmed.contains("{") {
449        return true;
450    }
451
452    // Pattern: CodeIssue { ... } or similar struct literals
453    if Regex::new(r"\w+\s*\{[^}]*file_path:")
454        .map(|re| re.is_match(trimmed))
455        .unwrap_or(false)
456    {
457        return true;
458    }
459
460    false
461}
462
/// Returns `true` when the trimmed line is exactly one punctuation token:
/// a single brace, bracket, parenthesis or semicolon.
fn is_simple_statement(line: &str) -> bool {
    const LONE_TOKENS: [&str; 7] = ["{", "}", ";", "(", ")", "[", "]"];

    let token = line.trim();
    LONE_TOKENS.contains(&token)
}
466
/// Heuristic for lines that consist of string-literal content.
///
/// After trimming, the line qualifies when it starts with `format!`, or when
/// it starts with `"` and additionally either ends with `"` or `,`, or
/// contains neither `fn ` nor `let `.
fn is_string_literal_line(line: &str) -> bool {
    let text = line.trim();

    if text.starts_with("format!") {
        return true;
    }

    // Every remaining rule requires an opening double quote.
    if !text.starts_with('"') {
        return false;
    }

    let closes_like_literal = text.ends_with('"') || text.ends_with(',');
    let mentions_code = text.contains("fn ") || text.contains("let ");

    closes_like_literal || !mentions_code
}
484
/// Builds a comparison signature for a block: whitespace is dropped, the
/// first 300 remaining characters are kept, and the result is lowercased.
fn generate_block_signature_smart(block: &str) -> String {
    const SIGNATURE_CHARS: usize = 300;

    let prefix: String = block
        .split_whitespace()
        .flat_map(str::chars)
        .take(SIGNATURE_CHARS)
        .collect();

    // Lowercase after truncation, so the 300-character cut is taken on the
    // original characters.
    prefix.to_lowercase()
}
494
495// ============================================================
496// Test Helpers (Public API for testing)
497// ============================================================
498
499/// Get the list of Rust common pattern strings for testing purposes.
500/// This provides controlled access to internal patterns without exposing the raw static.
501pub fn get_rust_patterns_for_testing() -> &'static [&'static str] {
502    RUST_COMMON_PATTERN_STRINGS
503}