garbage_code_hunter/rules/
duplication.rs1use std::collections::HashMap;
2use std::path::Path;
3use std::sync::OnceLock;
4
5use regex::Regex;
6use syn::{visit::Visit, Block, File};
7
8use crate::analyzer::{CodeIssue, Severity};
9use crate::rules::Rule;
10use crate::utils::get_position;
11
12static STRING_LITERAL_REGEX: OnceLock<Regex> = OnceLock::new();
14
/// Regex sources describing line shapes that are treated as ordinary Rust
/// boilerplate. A line matching any of them is skipped by the duplication
/// checks (see `is_common_rust_pattern`).
static RUST_COMMON_PATTERN_STRINGS: &[&str] = &[
    // Struct literals and issue-construction boilerplate.
    r"self\.\w+\.push\(\w+::\{",
    r"\w+\s*\{",
    r"file_path:\s*self\.\w+\.clone\(\)",
    r"rule_name:\s*.*\.to_string\(\)",
    r"message:\s*messages\[",
    r"severity:\s*Severity::",
    // Ubiquitous method calls.
    r"\.clone\(\)",
    r"\.to_string\(\)",
    r"\.to_lowercase\(\)",
    r"\.len\(\)",
    r"\.is_empty\(\)",
    r"\.unwrap\(\)",
    r"\.expect\(",
    // Control flow and bindings.
    r"if\s+.*\s*\{",
    r"for\s+.*\s+in\s+",
    r"match\s+.*\s*\{",
    r"let\s+.*=.*;",
    // Collection manipulation.
    r"\.push\(",
    r"\.insert\(",
    r"\.get\(",
    r".*get_or_insert",
    r"\.entry\(",
    // `syn` visitor plumbing.
    r"fn\s+visit_\w+",
    r"syn::visit::visit_\w+",
];
47
48static COMPILED_RUST_PATTERNS: OnceLock<Vec<Regex>> = OnceLock::new();
50
/// Summary of regex compilation failures, if any occurred while building
/// `COMPILED_RUST_PATTERNS`. Holds `Some(text)` only when at least one
/// pattern failed to compile.
static PATTERN_WARNING: OnceLock<Option<String>> = OnceLock::new();
53
54fn get_compiled_rust_patterns() -> &'static [Regex] {
55 COMPILED_RUST_PATTERNS.get_or_init(|| {
56 let mut compiled = Vec::with_capacity(RUST_COMMON_PATTERN_STRINGS.len());
57 let mut errors = Vec::new();
58 let total = RUST_COMMON_PATTERN_STRINGS.len();
59
60 for (index, pattern) in RUST_COMMON_PATTERN_STRINGS.iter().enumerate() {
61 match Regex::new(pattern) {
62 Ok(regex) => compiled.push(regex),
63 Err(e) => {
64 let error_msg = format!(
65 "[{}] Invalid regex pattern at index {}: '{}'\n Error: {}",
66 file!(),
67 index,
68 pattern,
69 e
70 );
71 eprintln!("⚠️ WARNING: {}", error_msg);
72 errors.push(error_msg);
73 }
74 }
75
76 let _ = index;
77 }
78
79 if !errors.is_empty() {
80 let warning = if compiled.is_empty() {
81 format!(
82 "🚨 CRITICAL: All {} regex patterns failed to compile!\n\
83 Code duplication detection is DISABLED.\n\
84 Errors:\n{}",
85 total,
86 errors.join("\n")
87 )
88 } else {
89 format!(
90 "⚠️ WARNING: {}/{} regex patterns failed to compile.\n\
91 Code duplication detection will use remaining {} patterns.\n\
92 Failed patterns:\n{}",
93 errors.len(),
94 total,
95 compiled.len(),
96 errors
97 .iter()
98 .map(|e| e.lines().next().unwrap_or("").to_string())
99 .collect::<Vec<_>>()
100 .join(", ")
101 )
102 };
103
104 eprintln!("\n{}\n", warning);
105
106 let _ = PATTERN_WARNING.set(Some(warning));
107 }
108
109 if compiled.is_empty() && !RUST_COMMON_PATTERN_STRINGS.is_empty() {
110 eprintln!(
111 "🚨 Falling back to empty pattern list. \
112 Code-duplication rule will have reduced detection capability.\n"
113 );
114 }
115
116 compiled
117 })
118}
119
120fn get_pattern_warning() -> Option<&'static str> {
121 PATTERN_WARNING.get_or_init(|| None).as_deref()
122}
123
/// Rule that reports duplicated code; registered under the name
/// `"code-duplication"`.
pub struct CodeDuplicationRule;
126
127impl Rule for CodeDuplicationRule {
128 fn name(&self) -> &'static str {
129 "code-duplication"
130 }
131
132 fn check(
133 &self,
134 file_path: &Path,
135 syntax_tree: &File,
136 content: &str,
137 lang: &str,
138 is_test_file: bool,
139 ) -> Vec<CodeIssue> {
140 if is_test_file {
141 return Vec::new();
142 }
143
144 if let Some(warning) = get_pattern_warning() {
145 eprintln!("\n⚠️ [code-duplication] {}\n", warning);
146 }
147
148 let mut visitor = DuplicationVisitor::new(file_path.to_path_buf(), content, lang);
149 visitor.visit_file(syntax_tree);
150 visitor.find_duplications()
151 }
152}
153
/// Per-file state for the duplication checks.
struct DuplicationVisitor {
    /// Path of the file under analysis; copied into every reported issue.
    file_path: std::path::PathBuf,
    /// Full text of the file.
    content: String,
    /// `(debug rendering, line)` for each block collected from the syntax tree.
    code_blocks: Vec<(String, usize)>,
    /// Normalised line text mapped to the 1-based line numbers where it occurs.
    line_hashes: HashMap<String, Vec<usize>>,
    /// Language tag selecting the message language (`"zh-CN"` or anything else).
    lang: String,
}
161
162impl DuplicationVisitor {
163 fn new(file_path: std::path::PathBuf, content: &str, lang: &str) -> Self {
164 Self {
165 file_path,
166 content: content.to_string(),
167 code_blocks: Vec::new(),
168 line_hashes: HashMap::new(),
169 lang: lang.to_string(),
170 }
171 }
172
173 fn find_duplications(&mut self) -> Vec<CodeIssue> {
174 let mut issues = Vec::new();
175
176 self.detect_line_duplications(&mut issues);
178
179 self.detect_block_duplications(&mut issues);
181
182 self.detect_consecutive_duplications(&mut issues);
184
185 issues
186 }
187
188 fn detect_line_duplications(&mut self, issues: &mut Vec<CodeIssue>) {
189 let lines: Vec<&str> = self.content.lines().collect();
190
191 for (line_num, line) in lines.iter().enumerate() {
192 let trimmed = line.trim();
193
194 if trimmed.is_empty()
196 || trimmed.starts_with("//")
197 || trimmed.starts_with("/*")
198 || trimmed.starts_with("*")
199 || trimmed.len() < 15
200 || is_simple_statement(trimmed)
201 {
202 continue;
203 }
204
205 if is_common_rust_pattern(trimmed) {
207 continue;
208 }
209
210 if is_string_literal_line(trimmed) {
212 continue;
213 }
214
215 if is_struct_initialization(trimmed) {
217 continue;
218 }
219
220 let normalized = normalize_line_smart(trimmed);
221 if normalized.len() < 10 {
222 continue;
223 }
224
225 self.line_hashes
226 .entry(normalized)
227 .or_default()
228 .push(line_num + 1);
229 }
230
231 for line_numbers in self.line_hashes.values() {
233 let count = line_numbers.len();
234
235 if count >= 25 {
238 let messages = self.generate_dup_messages(count);
239
240 let severity = if count >= 40 {
241 Severity::Nuclear
242 } else if count >= 30 {
243 Severity::Spicy
244 } else {
245 Severity::Mild
246 };
247
248 issues.push(CodeIssue {
249 file_path: self.file_path.clone(),
250 line: line_numbers[0],
251 column: 1,
252 rule_name: "code-duplication".to_string(),
253 message: messages[issues.len() % messages.len()].clone(),
254 severity,
255 });
256
257 if issues.len() >= 3 {
259 break;
260 }
261 }
262 }
263 }
264
265 fn detect_block_duplications(&self, issues: &mut Vec<CodeIssue>) {
266 let mut block_signatures: HashMap<String, Vec<usize>> = HashMap::new();
267
268 for (i, (block_str, _line)) in self.code_blocks.iter().enumerate() {
269 if block_str.len() > 500 {
270 let signature = generate_block_signature_smart(block_str);
271 block_signatures.entry(signature).or_default().push(i);
272 }
273 }
274
275 for (_, block_indices) in block_signatures {
276 if block_indices.len() >= 8 {
277 let messages = if self.lang == "zh-CN" {
278 vec![
279 format!("发现 {} 个相似代码块,考虑重构成函数", block_indices.len()),
280 "代码块重复度过高,DRY原则哭了".to_string(),
281 format!("检测到 {} 个相似代码块,重构时间到了", block_indices.len()),
282 ]
283 } else {
284 vec![
285 format!(
286 "Similar code blocks detected: {} instances",
287 block_indices.len()
288 ),
289 format!(
290 "Refactoring opportunity: {} similar blocks found",
291 block_indices.len()
292 ),
293 "Code block duplication too high, DRY principle is crying".to_string(),
294 ]
295 };
296
297 let line = self.code_blocks[block_indices[0]].1;
298
299 issues.push(CodeIssue {
300 file_path: self.file_path.clone(),
301 line,
302 column: 1,
303 rule_name: "code-duplication".to_string(),
304 message: messages[issues.len() % messages.len()].clone(),
305 severity: Severity::Spicy,
306 });
307 }
308 }
309 }
310
311 fn detect_consecutive_duplications(&self, issues: &mut Vec<CodeIssue>) {
314 let lines: Vec<&str> = self.content.lines().collect();
315 let mut i = 0;
316
317 while i < lines.len().saturating_sub(3) {
318 let current = normalize_line_smart(lines[i].trim());
319
320 if current.is_empty() || current.len() < 15 {
321 i += 1;
322 continue;
323 }
324
325 let mut dup_count = 1;
327 let mut start_line = i + 1;
328
329 while start_line < lines.len() && dup_count < 5 {
330 let next_normalized = normalize_line_smart(lines[start_line].trim());
331 if next_normalized == current && !is_common_rust_pattern(lines[start_line].trim()) {
332 dup_count += 1;
333 start_line += 1;
334 } else {
335 break;
336 }
337 }
338
339 if dup_count >= 4 {
341 let messages = if self.lang == "zh-CN" {
342 vec![
343 format!("发现连续 {} 行完全相同的代码!这是复制粘贴!", dup_count),
344 format!("{} 行重复代码块,建议提取为函数或宏", dup_count),
345 ]
346 } else {
347 vec![
348 format!(
349 "Found {} consecutive identical lines! This looks like copy-paste!",
350 dup_count
351 ),
352 format!(
353 "{} line duplicate block detected - consider extracting to function/macro",
354 dup_count
355 ),
356 ]
357 };
358
359 issues.push(CodeIssue {
360 file_path: self.file_path.clone(),
361 line: i + 1,
362 column: 1,
363 rule_name: "code-duplication".to_string(),
364 message: messages[0].clone(),
365 severity: Severity::Spicy,
366 });
367
368 i = start_line; } else {
370 i += 1;
371 }
372 }
373 }
374
375 fn generate_dup_messages(&self, count: usize) -> Vec<String> {
376 if self.lang == "zh-CN" {
377 vec![
378 format!("检测到 {} 次重复代码!你是复制粘贴大师吗?", count),
379 format!("这行代码重复了 {} 次,建议提取成函数", count),
380 format!("重复代码警报!{} 次重复让维护变成噩梦", count),
381 format!("复制粘贴忍者出现!{} 行相同代码", count),
382 format!("违反 DRY 原则:{} 行重复代码", count),
383 ]
384 } else {
385 vec![
386 format!("Copy-paste ninja detected! {} identical lines found", count),
387 format!("DRY principle violation: {} duplicated lines", count),
388 format!("Code duplication alert! {} repetitions found", count),
389 format!(
390 "This line repeated {} times - consider extracting to function",
391 count
392 ),
393 format!("Maintenance nightmare: {} duplicate lines detected", count),
394 ]
395 }
396 }
397}
398
399impl<'ast> Visit<'ast> for DuplicationVisitor {
400 fn visit_block(&mut self, block: &'ast Block) {
401 let block_str = format!("{block:?}");
402 if block_str.len() > 50 {
403 let (line, _) = get_position(block);
404 self.code_blocks.push((block_str, line));
405 }
406 syn::visit::visit_block(self, block);
407 }
408}
409
410fn normalize_line_smart(line: &str) -> String {
412 let re = STRING_LITERAL_REGEX.get_or_init(|| Regex::new(r#""[^"]*""#).unwrap());
413
414 let stripped = re.replace_all(line.trim(), "STR");
415
416 stripped.replace(char::is_whitespace, "").to_lowercase()
417}
418
419fn is_common_rust_pattern(line: &str) -> bool {
421 let trimmed = line.trim();
422
423 for pattern in get_compiled_rust_patterns().iter() {
424 if pattern.is_match(trimmed) {
425 return true;
426 }
427 }
428
429 false
430}
431
432fn is_struct_initialization(line: &str) -> bool {
434 let trimmed = line.trim();
435
436 if trimmed.contains('{') && trimmed.contains('}') {
438 let field_count = trimmed.matches(':').count();
440
441 if field_count >= 3 {
443 return true;
444 }
445 }
446
447 if trimmed.contains(".push(") && trimmed.contains("{") {
449 return true;
450 }
451
452 if Regex::new(r"\w+\s*\{[^}]*file_path:")
454 .map(|re| re.is_match(trimmed))
455 .unwrap_or(false)
456 {
457 return true;
458 }
459
460 false
461}
462
/// Returns `true` when the trimmed line is exactly one bracket, brace,
/// parenthesis or semicolon.
fn is_simple_statement(line: &str) -> bool {
    const LONE_TOKENS: [&str; 7] = ["{", "}", ";", "(", ")", "[", "]"];

    let token = line.trim();
    LONE_TOKENS.iter().any(|lone| *lone == token)
}
466
/// Heuristically decides whether a line consists mainly of a string literal.
///
/// A trimmed line starting with `"` qualifies when it ends with `"` or `,`,
/// or when it contains neither `fn ` nor `let `. A line not starting with
/// `"` qualifies only when it starts with `format!`.
fn is_string_literal_line(line: &str) -> bool {
    let text = line.trim();

    if !text.starts_with('"') {
        return text.starts_with("format!");
    }

    let closes_literal = text.ends_with('"') || text.ends_with(',');
    let mentions_code = text.contains("fn ") || text.contains("let ");
    closes_literal || !mentions_code
}
484
/// Builds a comparison signature for a block: the first 300 non-whitespace
/// characters of `block`, lowercased.
fn generate_block_signature_smart(block: &str) -> String {
    let mut prefix = String::new();
    for ch in block.chars().filter(|ch| !ch.is_whitespace()).take(300) {
        prefix.push(ch);
    }
    prefix.to_lowercase()
}
494
495pub fn get_rust_patterns_for_testing() -> &'static [&'static str] {
502 RUST_COMMON_PATTERN_STRINGS
503}