use anyhow::Result;
use serde_json::Value;
use std::collections::{HashMap, HashSet};
use std::path::Path;
use walkdir::WalkDir;

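/// A lightweight, line-based approximation of an AST: each significant
/// (non-blank, non-comment) source line becomes one child node carrying a
/// normalized form and a structural hash. It is not a real parse tree.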
#[derive(Debug, Clone, PartialEq)]
pub struct AstNode {
    pub node_type: String,
    pub children: Vec<AstNode>,
    pub normalized_text: String,
    pub structural_hash: u64,
}

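/// How a duplicate was detected: a near-verbatim copy, matching line
/// structure, shared semantic vocabulary, or a recurring implementation
/// pattern.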
#[derive(Debug, Clone, PartialEq)]
pub enum DuplicateType {
    ExactCopy,
    StructuralSimilar,
    SemanticSimilar,
    PatternDuplicate,
}

#[derive(Debug, Clone)]
pub struct RefactoringSuggestion {
    pub suggestion_type: String,
    pub description: String,
    pub estimated_effort: String,
    pub potential_savings: String,
    pub implementation_steps: Vec<String>,
}

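/// One detected duplicate group: the similarity score, the files involved,
/// and the refactoring guidance attached during post-processing.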
#[derive(Debug, Clone)]
pub struct DuplicateResult {
    pub similarity_score: f64,
    pub duplicate_type: DuplicateType,
    pub files: Vec<DuplicateFile>,
    pub common_patterns: Vec<String>,
    pub refactoring_suggestions: Vec<RefactoringSuggestion>,
    pub confidence_level: f64,
    pub estimated_savings: DuplicateSavings,
}

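/// A single file participating in a duplicate group. `start_line` and
/// `end_line` are `None` when the match spans the whole file (the only case
/// the current detectors produce).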
#[derive(Debug, Clone)]
pub struct DuplicateFile {
    pub path: String,
    pub lines: usize,
    pub start_line: Option<usize>,
    pub end_line: Option<usize>,
    pub complexity_score: f64,
}

#[derive(Debug, Clone)]
pub struct DuplicateSavings {
    pub lines_of_code: usize,
    pub maintenance_effort: String,
    pub bug_risk_reduction: String,
}

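/// Detects duplicated code across a repository using several strategies
/// (exact, structural, semantic, and pattern-based).
///
/// A minimal usage sketch (the surrounding crate path and error handling are
/// illustrative, not prescribed):
///
/// ```ignore
/// let mut analyzer = DuplicateAnalyzer::new();
/// let results = analyzer.find_code_duplicates_advanced(
///     std::path::Path::new("./my-repo"), // hypothetical repo path
///     0.8,  // similarity threshold
///     5,    // minimum lines per file
///     &[],  // no exclude patterns
/// )?;
/// ```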
pub struct DuplicateAnalyzer {
    #[allow(dead_code)]
    ast_cache: HashMap<String, AstNode>,
    language_analyzers: HashMap<String, LanguageAnalyzer>,
    semantic_patterns: HashMap<String, Vec<String>>,
}

#[derive(Debug, Clone)]
struct LanguageAnalyzer {
    keywords: Vec<String>,
    #[allow(dead_code)]
    operators: Vec<String>,
    control_structures: Vec<String>,
    comment_patterns: Vec<String>,
}

impl DuplicateAnalyzer {
    pub fn new() -> Self {
        let mut analyzer = Self {
            ast_cache: HashMap::new(),
            semantic_patterns: HashMap::new(),
            language_analyzers: HashMap::new(),
        };
        analyzer.initialize_language_analyzers();
        analyzer.initialize_semantic_patterns();
        analyzer
    }

    fn initialize_language_analyzers(&mut self) {
        self.language_analyzers.insert(
            "py".to_string(),
            LanguageAnalyzer {
                keywords: vec![
                    "def".to_string(),
                    "class".to_string(),
                    "if".to_string(),
                    "for".to_string(),
                    "while".to_string(),
                    "try".to_string(),
                    "except".to_string(),
                    "with".to_string(),
                    "import".to_string(),
                    "from".to_string(),
                    "return".to_string(),
                    "yield".to_string(),
                ],
                operators: vec![
                    "+".to_string(),
                    "-".to_string(),
                    "*".to_string(),
                    "/".to_string(),
                    "==".to_string(),
                    "!=".to_string(),
                    ">=".to_string(),
                    "<=".to_string(),
                    "and".to_string(),
                    "or".to_string(),
                    "not".to_string(),
                    "in".to_string(),
                ],
                control_structures: vec![
                    "if".to_string(),
                    "elif".to_string(),
                    "else".to_string(),
                    "for".to_string(),
                    "while".to_string(),
                    "try".to_string(),
                    "except".to_string(),
                    "finally".to_string(),
                ],
                comment_patterns: vec!["#".to_string(), "\"\"\"".to_string(), "'''".to_string()],
            },
        );

        self.language_analyzers.insert(
            "js".to_string(),
            LanguageAnalyzer {
                keywords: vec![
                    "function".to_string(),
                    "class".to_string(),
                    "if".to_string(),
                    "for".to_string(),
                    "while".to_string(),
                    "try".to_string(),
                    "catch".to_string(),
                    "const".to_string(),
                    "let".to_string(),
                    "var".to_string(),
                    "return".to_string(),
                    "async".to_string(),
                    "await".to_string(),
                    "import".to_string(),
                    "export".to_string(),
                ],
                operators: vec![
                    "+".to_string(),
                    "-".to_string(),
                    "*".to_string(),
                    "/".to_string(),
                    "==".to_string(),
                    "===".to_string(),
                    "!=".to_string(),
                    "!==".to_string(),
                    "&&".to_string(),
                    "||".to_string(),
                    "!".to_string(),
                ],
                control_structures: vec![
                    "if".to_string(),
                    "else".to_string(),
                    "switch".to_string(),
                    "case".to_string(),
                    "for".to_string(),
                    "while".to_string(),
                    "do".to_string(),
                    "try".to_string(),
                    "catch".to_string(),
                    "finally".to_string(),
                ],
                comment_patterns: vec!["//".to_string(), "/*".to_string(), "*/".to_string()],
            },
        );

        // TypeScript shares the JavaScript analyzer.
        let js_analyzer = self.language_analyzers["js"].clone();
        self.language_analyzers.insert("ts".to_string(), js_analyzer);
    }

    fn initialize_semantic_patterns(&mut self) {
        self.semantic_patterns.insert(
            "data_validation".to_string(),
            vec![
                "validate".to_string(),
                "check".to_string(),
                "verify".to_string(),
                "assert".to_string(),
                "ensure".to_string(),
                "require".to_string(),
            ],
        );

        self.semantic_patterns.insert(
            "error_handling".to_string(),
            vec![
                "try".to_string(),
                "catch".to_string(),
                "except".to_string(),
                "error".to_string(),
                "exception".to_string(),
                "throw".to_string(),
                "raise".to_string(),
                "handle".to_string(),
            ],
        );

        self.semantic_patterns.insert(
            "database_operations".to_string(),
            vec![
                "select".to_string(),
                "insert".to_string(),
                "update".to_string(),
                "delete".to_string(),
                "query".to_string(),
                "execute".to_string(),
                "commit".to_string(),
                "rollback".to_string(),
            ],
        );

        self.semantic_patterns.insert(
            "api_patterns".to_string(),
            vec![
                "get".to_string(),
                "post".to_string(),
                "put".to_string(),
                "delete".to_string(),
                "patch".to_string(),
                "request".to_string(),
                "response".to_string(),
                "endpoint".to_string(),
                "route".to_string(),
            ],
        );
    }

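    /// Walks `repo_path`, analyzes supported source files, and returns the
    /// merged, deduplicated reports from all detection strategies, each
    /// enriched with refactoring suggestions. Paths containing any of
    /// `exclude_patterns` (plain substring match) are skipped, as are files
    /// shorter than `min_lines`.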
    pub fn find_code_duplicates_advanced(
        &mut self,
        repo_path: &Path,
        similarity_threshold: f64,
        min_lines: usize,
        exclude_patterns: &[String],
    ) -> Result<Vec<DuplicateResult>> {
        let mut duplicates = Vec::new();
        let mut file_contents = HashMap::new();
        let mut file_asts = HashMap::new();

        // Collect and parse all supported source files.
        for entry in WalkDir::new(repo_path)
            .into_iter()
            .filter_map(|e| e.ok())
            .filter(|e| e.file_type().is_file())
        {
            let path = entry.path();
            if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
                if [
                    "js", "ts", "py", "java", "rs", "c", "cpp", "go", "rb", "php",
                ]
                .contains(&ext)
                {
                    let path_str = path.to_string_lossy();
                    if exclude_patterns
                        .iter()
                        .any(|pattern| path_str.contains(pattern))
                    {
                        continue;
                    }

                    if let Ok(content) = std::fs::read_to_string(path) {
                        let lines = content.lines().count();
                        if lines >= min_lines {
                            let ast = self.parse_file_ast(&content, ext)?;
                            file_asts.insert(path.to_path_buf(), ast);
                            file_contents.insert(path.to_path_buf(), content);
                        }
                    }
                }
            }
        }

        // Run each detection strategy, then merge, dedupe, and rank.
        duplicates.extend(self.find_exact_duplicates(&file_contents, min_lines)?);
        duplicates.extend(self.find_structural_duplicates(
            &file_asts,
            similarity_threshold,
            min_lines,
        )?);
        duplicates.extend(self.find_semantic_duplicates(
            &file_contents,
            similarity_threshold,
            min_lines,
        )?);
        duplicates.extend(self.find_pattern_duplicates(&file_contents, similarity_threshold)?);

        let deduplicated = self.deduplicate_results(duplicates);
        let enhanced_results = deduplicated
            .into_iter()
            .map(|dup| self.enhance_with_refactoring_suggestions(dup))
            .collect();

        Ok(enhanced_results)
    }

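    /// Builds the flat, line-based pseudo-AST described on [`AstNode`]: one
    /// child per significant line, plus a whole-file structural hash.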
    fn parse_file_ast(&mut self, content: &str, language: &str) -> Result<AstNode> {
        let lines: Vec<&str> = content.lines().collect();
        let mut root_children = Vec::new();

        for line in lines.iter() {
            let trimmed = line.trim();
            if trimmed.is_empty() || self.is_comment_line(trimmed, language) {
                continue;
            }

            let node_type = self.classify_line_type(trimmed, language);
            let normalized = self.normalize_line_for_ast(trimmed, language);
            let hash = self.calculate_structural_hash(&normalized);

            root_children.push(AstNode {
                node_type,
                children: Vec::new(),
                normalized_text: normalized,
                structural_hash: hash,
            });
        }

        Ok(AstNode {
            node_type: "file".to_string(),
            children: root_children,
            normalized_text: "".to_string(),
            structural_hash: self.calculate_structural_hash(content),
        })
    }

    fn is_comment_line(&self, line: &str, language: &str) -> bool {
        if let Some(analyzer) = self.language_analyzers.get(language) {
            analyzer
                .comment_patterns
                .iter()
                .any(|pattern| line.starts_with(pattern))
        } else {
            line.starts_with("//") || line.starts_with("#") || line.starts_with("/*")
        }
    }

    fn classify_line_type(&self, line: &str, language: &str) -> String {
        if let Some(analyzer) = self.language_analyzers.get(language) {
            // Substring matching is a cheap heuristic; it can misfire on
            // identifiers that embed a keyword (e.g. "iffy" contains "if").
            for keyword in &analyzer.keywords {
                if line.contains(keyword) {
                    return keyword.clone();
                }
            }
            for control in &analyzer.control_structures {
                if line.contains(control) {
                    return "control_structure".to_string();
                }
            }
        }
        "statement".to_string()
    }

    fn normalize_line_for_ast(&self, line: &str, language: &str) -> String {
        let mut normalized = line.to_string();

        if let Some(_analyzer) = self.language_analyzers.get(language) {
            // Erase naming and literal differences so that structurally
            // identical lines hash the same. Note that keywords are also
            // rewritten to IDENTIFIER by the first pass.
            normalized = regex::Regex::new(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")
                .unwrap()
                .replace_all(&normalized, "IDENTIFIER")
                .to_string();

            normalized = regex::Regex::new(r"\b\d+\b")
                .unwrap()
                .replace_all(&normalized, "NUMBER")
                .to_string();

            normalized = regex::Regex::new(r#""[^"]*""#)
                .unwrap()
                .replace_all(&normalized, "STRING")
                .to_string();
        }

        normalized
    }

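    /// Hashes text with the standard library's `DefaultHasher`; equal inputs
    /// get equal hashes within a run, which is all the comparisons need.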
    fn calculate_structural_hash(&self, content: &str) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        content.hash(&mut hasher);
        hasher.finish()
    }

    fn find_exact_duplicates(
        &self,
        file_contents: &HashMap<std::path::PathBuf, String>,
        min_lines: usize,
    ) -> Result<Vec<DuplicateResult>> {
        let mut duplicates = Vec::new();
        let mut analyzed_pairs = HashSet::new();

        for (file1, content1) in file_contents {
            for (file2, content2) in file_contents {
                if file1 >= file2 || analyzed_pairs.contains(&(file1.clone(), file2.clone())) {
                    continue;
                }
                analyzed_pairs.insert((file1.clone(), file2.clone()));

                let similarity = self.calculate_exact_similarity(content1, content2);
                if similarity >= 0.95 {
                    let lines1 = content1.lines().count();
                    let lines2 = content2.lines().count();

                    if lines1 >= min_lines && lines2 >= min_lines {
                        duplicates.push(DuplicateResult {
                            similarity_score: similarity,
                            duplicate_type: DuplicateType::ExactCopy,
                            files: vec![
                                DuplicateFile {
                                    path: file1.display().to_string(),
                                    lines: lines1,
                                    start_line: None,
                                    end_line: None,
                                    complexity_score: self.calculate_complexity_score(content1),
                                },
                                DuplicateFile {
                                    path: file2.display().to_string(),
                                    lines: lines2,
                                    start_line: None,
                                    end_line: None,
                                    complexity_score: self.calculate_complexity_score(content2),
                                },
                            ],
                            common_patterns: self.identify_common_patterns(content1, content2),
                            refactoring_suggestions: Vec::new(),
                            confidence_level: 0.95,
                            estimated_savings: DuplicateSavings {
                                lines_of_code: lines1.min(lines2),
                                maintenance_effort: "High".to_string(),
                                bug_risk_reduction: "Significant".to_string(),
                            },
                        });
                    }
                }
            }
        }

        Ok(duplicates)
    }

    fn find_structural_duplicates(
        &self,
        file_asts: &HashMap<std::path::PathBuf, AstNode>,
        similarity_threshold: f64,
        min_lines: usize,
    ) -> Result<Vec<DuplicateResult>> {
        let mut duplicates = Vec::new();
        let mut analyzed_pairs = HashSet::new();

        for (file1, ast1) in file_asts {
            for (file2, ast2) in file_asts {
                if file1 >= file2 || analyzed_pairs.contains(&(file1.clone(), file2.clone())) {
                    continue;
                }
                analyzed_pairs.insert((file1.clone(), file2.clone()));

                let similarity = self.calculate_structural_similarity_ast(ast1, ast2);
                if similarity >= similarity_threshold
                    && ast1.children.len() >= min_lines
                    && ast2.children.len() >= min_lines
                {
                    duplicates.push(DuplicateResult {
                        similarity_score: similarity,
                        duplicate_type: DuplicateType::StructuralSimilar,
                        files: vec![
                            DuplicateFile {
                                path: file1.display().to_string(),
                                lines: ast1.children.len(),
                                start_line: None,
                                end_line: None,
                                complexity_score: self.calculate_ast_complexity(ast1),
                            },
                            DuplicateFile {
                                path: file2.display().to_string(),
                                lines: ast2.children.len(),
                                start_line: None,
                                end_line: None,
                                complexity_score: self.calculate_ast_complexity(ast2),
                            },
                        ],
                        common_patterns: self.identify_structural_patterns(ast1, ast2),
                        refactoring_suggestions: Vec::new(),
                        confidence_level: similarity * 0.9,
                        estimated_savings: DuplicateSavings {
                            lines_of_code: ast1.children.len().min(ast2.children.len()),
                            maintenance_effort: "Medium".to_string(),
                            bug_risk_reduction: "Moderate".to_string(),
                        },
                    });
                }
            }
        }

        Ok(duplicates)
    }

    fn find_semantic_duplicates(
        &self,
        file_contents: &HashMap<std::path::PathBuf, String>,
        similarity_threshold: f64,
        min_lines: usize,
    ) -> Result<Vec<DuplicateResult>> {
        let mut duplicates = Vec::new();
        let mut analyzed_pairs = HashSet::new();

        for (file1, content1) in file_contents {
            for (file2, content2) in file_contents {
                if file1 >= file2 || analyzed_pairs.contains(&(file1.clone(), file2.clone())) {
                    continue;
                }
                analyzed_pairs.insert((file1.clone(), file2.clone()));

                let similarity = self.calculate_semantic_similarity(content1, content2);
                if similarity >= similarity_threshold {
                    let lines1 = content1.lines().count();
                    let lines2 = content2.lines().count();

                    if lines1 >= min_lines && lines2 >= min_lines {
                        duplicates.push(DuplicateResult {
                            similarity_score: similarity,
                            duplicate_type: DuplicateType::SemanticSimilar,
                            files: vec![
                                DuplicateFile {
                                    path: file1.display().to_string(),
                                    lines: lines1,
                                    start_line: None,
                                    end_line: None,
                                    complexity_score: self.calculate_complexity_score(content1),
                                },
                                DuplicateFile {
                                    path: file2.display().to_string(),
                                    lines: lines2,
                                    start_line: None,
                                    end_line: None,
                                    complexity_score: self.calculate_complexity_score(content2),
                                },
                            ],
                            common_patterns: self.identify_semantic_patterns(content1, content2),
                            refactoring_suggestions: Vec::new(),
                            confidence_level: similarity * 0.8,
                            estimated_savings: DuplicateSavings {
                                lines_of_code: lines1.min(lines2) / 2,
                                maintenance_effort: "Medium".to_string(),
                                bug_risk_reduction: "Low".to_string(),
                            },
                        });
                    }
                }
            }
        }

        Ok(duplicates)
    }

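    /// Placeholder for pattern-level detection (recurring design or
    /// algorithmic patterns); currently always reports no duplicates.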
    fn find_pattern_duplicates(
        &self,
        _file_contents: &HashMap<std::path::PathBuf, String>,
        _similarity_threshold: f64,
    ) -> Result<Vec<DuplicateResult>> {
        // Not yet implemented.
        Ok(Vec::new())
    }

    fn calculate_exact_similarity(&self, content1: &str, content2: &str) -> f64 {
        self.calculate_content_similarity(content1, content2)
    }

    fn calculate_structural_similarity_ast(&self, ast1: &AstNode, ast2: &AstNode) -> f64 {
        if ast1.children.is_empty() && ast2.children.is_empty() {
            return if ast1.normalized_text == ast2.normalized_text {
                1.0
            } else {
                0.0
            };
        }

        let mut matches = 0;
        let total = ast1.children.len().max(ast2.children.len());

        for child1 in &ast1.children {
            for child2 in &ast2.children {
                if child1.node_type == child2.node_type
                    && child1.structural_hash == child2.structural_hash
                {
                    matches += 1;
                    break;
                }
            }
        }

        if total == 0 {
            0.0
        } else {
            matches as f64 / total as f64
        }
    }

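    /// Scores how much two files share the vocabulary of each semantic
    /// category (validation, error handling, database, API), averaged over
    /// the categories.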
    fn calculate_semantic_similarity(&self, content1: &str, content2: &str) -> f64 {
        let mut similarity_score = 0.0;
        let category_count = self.semantic_patterns.len();

        for patterns in self.semantic_patterns.values() {
            let count1 = patterns
                .iter()
                .map(|p| content1.matches(p).count())
                .sum::<usize>();
            let count2 = patterns
                .iter()
                .map(|p| content2.matches(p).count())
                .sum::<usize>();

            if count1 > 0 && count2 > 0 {
                similarity_score += (count1.min(count2) as f64) / (count1.max(count2) as f64);
            }
        }

        // Each category contributes at most 1.0, so average over categories;
        // averaging over individual keywords (as before) capped the score
        // near 0.1 and made typical thresholds unreachable.
        if category_count == 0 {
            0.0
        } else {
            similarity_score / category_count as f64
        }
    }

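    /// Legacy JSON adapter around [`Self::find_code_duplicates_advanced`] for
    /// callers that expect `serde_json::Value` results.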
    pub fn find_code_duplicates(
        &mut self,
        repo_path: &Path,
        similarity_threshold: f64,
        min_lines: usize,
        exclude_patterns: &[String],
    ) -> Result<Vec<Value>> {
        let advanced_results = self.find_code_duplicates_advanced(
            repo_path,
            similarity_threshold,
            min_lines,
            exclude_patterns,
        )?;

        let legacy_results = advanced_results
            .into_iter()
            .map(|result| {
                serde_json::json!({
                    "similarity": result.similarity_score,
                    "files": result.files.iter().map(|f| serde_json::json!({
                        "path": f.path,
                        "lines": f.lines
                    })).collect::<Vec<_>>(),
                    "lines": result.files.iter().map(|f| f.lines).min().unwrap_or(0),
                    "type": match result.duplicate_type {
                        DuplicateType::ExactCopy => "exact_copy",
                        DuplicateType::StructuralSimilar => "structural_similar",
                        DuplicateType::SemanticSimilar => "semantic_similar",
                        DuplicateType::PatternDuplicate => "pattern_duplicate",
                    },
                    "confidence_level": result.confidence_level,
                    "refactoring_suggestions": result.refactoring_suggestions.len()
                })
            })
            .collect();

        Ok(legacy_results)
    }

    fn calculate_complexity_score(&self, content: &str) -> f64 {
        let lines = content.lines().count();
        let non_empty_lines = content
            .lines()
            .filter(|line| !line.trim().is_empty())
            .count();

        // Rough substring counts; good enough for a relative score.
        let control_structures = content.matches("if").count()
            + content.matches("for").count()
            + content.matches("while").count()
            + content.matches("try").count()
            + content.matches("catch").count()
            + content.matches("switch").count();
        let functions = content.matches("def ").count()
            + content.matches("function ").count()
            + content.matches("class ").count();

        let base_complexity = non_empty_lines as f64 / lines.max(1) as f64;
        let control_complexity = control_structures as f64 / non_empty_lines.max(1) as f64;
        let function_complexity = functions as f64 / non_empty_lines.max(1) as f64;

        (base_complexity + control_complexity + function_complexity) * 100.0
    }

    fn calculate_ast_complexity(&self, ast: &AstNode) -> f64 {
        let mut complexity = 0.0;
        complexity += ast.children.len() as f64;

        for child in &ast.children {
            if child.node_type.contains("if")
                || child.node_type.contains("for")
                || child.node_type.contains("while")
            {
                // Control-flow nodes are weighted double.
                complexity += 2.0;
            } else {
                complexity += 1.0;
            }
        }

        complexity
    }

    fn identify_common_patterns(&self, content1: &str, content2: &str) -> Vec<String> {
        let mut patterns = Vec::new();

        for (pattern_type, pattern_keywords) in &self.semantic_patterns {
            let matches1 = pattern_keywords
                .iter()
                .filter(|&keyword| content1.contains(keyword))
                .count();
            let matches2 = pattern_keywords
                .iter()
                .filter(|&keyword| content2.contains(keyword))
                .count();

            if matches1 > 0 && matches2 > 0 {
                patterns.push(pattern_type.clone());
            }
        }

        let common_keywords = ["function", "class", "if", "for", "while", "try", "catch"];
        for keyword in &common_keywords {
            if content1.contains(keyword) && content2.contains(keyword) {
                patterns.push(keyword.to_string());
            }
        }

        patterns
    }

    fn identify_structural_patterns(&self, ast1: &AstNode, ast2: &AstNode) -> Vec<String> {
        let mut patterns = Vec::new();

        let types1: HashSet<_> = ast1.children.iter().map(|child| &child.node_type).collect();
        let types2: HashSet<_> = ast2.children.iter().map(|child| &child.node_type).collect();

        for common_type in types1.intersection(&types2) {
            patterns.push(format!("structural_{common_type}"));
        }

        patterns
    }

    fn identify_semantic_patterns(&self, content1: &str, content2: &str) -> Vec<String> {
        let mut patterns = Vec::new();

        for (pattern_type, keywords) in &self.semantic_patterns {
            let score1 = keywords
                .iter()
                .map(|k| content1.matches(k).count())
                .sum::<usize>();
            let score2 = keywords
                .iter()
                .map(|k| content2.matches(k).count())
                .sum::<usize>();

            if score1 > 0 && score2 > 0 {
                let similarity = (score1.min(score2) as f64) / (score1.max(score2) as f64);
                if similarity > 0.5 {
                    patterns.push(format!("semantic_{pattern_type}"));
                }
            }
        }

        patterns
    }

    fn deduplicate_results(&self, mut duplicates: Vec<DuplicateResult>) -> Vec<DuplicateResult> {
        // Highest-confidence results win; total_cmp avoids a panic on NaN.
        duplicates.sort_by(|a, b| b.confidence_level.total_cmp(&a.confidence_level));

        let mut result = Vec::new();
        let mut seen_files = HashSet::new();

        for duplicate in duplicates {
            let file_paths: Vec<String> = duplicate.files.iter().map(|f| f.path.clone()).collect();

            if !file_paths.iter().any(|path| seen_files.contains(path)) {
                for path in &file_paths {
                    seen_files.insert(path.clone());
                }
                result.push(duplicate);
            }
        }

        result
    }

    fn enhance_with_refactoring_suggestions(
        &self,
        mut duplicate: DuplicateResult,
    ) -> DuplicateResult {
        let mut suggestions = Vec::new();

        match duplicate.duplicate_type {
            DuplicateType::ExactCopy => {
                suggestions.push(RefactoringSuggestion {
                    suggestion_type: "Extract Common Function".to_string(),
                    description: "Extract the duplicated code into a common function or module"
                        .to_string(),
                    estimated_effort: "Low".to_string(),
                    potential_savings: format!(
                        "{} lines of code",
                        duplicate.estimated_savings.lines_of_code
                    ),
                    implementation_steps: vec![
                        "1. Create a new function with the common code".to_string(),
                        "2. Replace duplicate instances with function calls".to_string(),
                        "3. Test to ensure functionality is preserved".to_string(),
                    ],
                });
            }
            DuplicateType::StructuralSimilar => {
                suggestions.push(RefactoringSuggestion {
                    suggestion_type: "Create Abstract Base Class".to_string(),
                    description: "Create a common base class or interface for similar structures"
                        .to_string(),
                    estimated_effort: "Medium".to_string(),
                    potential_savings: format!(
                        "{} lines of code reduction",
                        duplicate.estimated_savings.lines_of_code / 2
                    ),
                    implementation_steps: vec![
                        "1. Identify common structural elements".to_string(),
                        "2. Create base class or interface".to_string(),
                        "3. Refactor duplicate classes to inherit/implement".to_string(),
                        "4. Test inheritance hierarchy".to_string(),
                    ],
                });
            }
            DuplicateType::SemanticSimilar => {
                suggestions.push(RefactoringSuggestion {
                    suggestion_type: "Strategy Pattern".to_string(),
                    description: "Use strategy pattern to handle similar functionality".to_string(),
                    estimated_effort: "High".to_string(),
                    potential_savings: "Improved maintainability and reduced complexity"
                        .to_string(),
                    implementation_steps: vec![
                        "1. Define common interface for similar behaviors".to_string(),
                        "2. Implement concrete strategies".to_string(),
                        "3. Refactor to use strategy pattern".to_string(),
                        "4. Add configuration for strategy selection".to_string(),
                    ],
                });
            }
            DuplicateType::PatternDuplicate => {
                suggestions.push(RefactoringSuggestion {
                    suggestion_type: "Template Method".to_string(),
                    description: "Use template method pattern for common algorithmic structure"
                        .to_string(),
                    estimated_effort: "Medium".to_string(),
                    potential_savings: "Reduced code duplication and improved consistency"
                        .to_string(),
                    implementation_steps: vec![
                        "1. Identify common algorithm structure".to_string(),
                        "2. Create template method in base class".to_string(),
                        "3. Implement varying steps in subclasses".to_string(),
                    ],
                });
            }
        }

        let avg_complexity = duplicate
            .files
            .iter()
            .map(|f| f.complexity_score)
            .sum::<f64>()
            / duplicate.files.len() as f64;

        if avg_complexity > 50.0 {
            suggestions.push(RefactoringSuggestion {
                suggestion_type: "Simplify Complex Code".to_string(),
                description:
                    "Break down complex duplicated code into smaller, more manageable pieces"
                        .to_string(),
                estimated_effort: "High".to_string(),
                potential_savings: "Improved readability and maintainability".to_string(),
                implementation_steps: vec![
                    "1. Identify complex sections within duplicates".to_string(),
                    "2. Extract helper functions".to_string(),
                    "3. Simplify conditional logic".to_string(),
                    "4. Add comprehensive tests".to_string(),
                ],
            });
        }

        duplicate.refactoring_suggestions = suggestions;
        duplicate
    }

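    /// Line-set (Jaccard) similarity in `[0.0, 1.0]`, ignoring blank lines
    /// and `//`/`#` comments.
    ///
    /// For example, `"a\nb\nc"` vs `"a\nb\nd"` shares 2 of 4 distinct lines,
    /// so the score is 0.5.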
    pub fn calculate_content_similarity(&self, content1: &str, content2: &str) -> f64 {
        let lines1: Vec<String> = content1
            .lines()
            .map(|s| s.trim().to_string())
            .filter(|s| !s.is_empty() && !s.starts_with("//") && !s.starts_with("#"))
            .collect();

        let lines2: Vec<String> = content2
            .lines()
            .map(|s| s.trim().to_string())
            .filter(|s| !s.is_empty() && !s.starts_with("//") && !s.starts_with("#"))
            .collect();

        if lines1.is_empty() || lines2.is_empty() {
            return 0.0;
        }

        let set1: HashSet<String> = lines1.into_iter().collect();
        let set2: HashSet<String> = lines2.into_iter().collect();

        // Both sets are non-empty here, so the union is never zero.
        let intersection = set1.intersection(&set2).count();
        let union = set1.union(&set2).count();

        intersection as f64 / union as f64
    }

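    /// Compares every pair of non-overlapping `min_lines`-sized windows
    /// within a single file and reports pairs at or above
    /// `similarity_threshold`. Runs in O(n^2) window comparisons, so it is
    /// intended for one file at a time, not whole repositories.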
    pub fn find_duplicate_blocks(
        &self,
        content: &str,
        min_lines: usize,
        similarity_threshold: f64,
    ) -> Result<Vec<Value>> {
        let mut duplicates = Vec::new();
        let lines: Vec<&str> = content.lines().collect();

        if lines.len() < min_lines * 2 {
            return Ok(duplicates);
        }

        for i in 0..=lines.len().saturating_sub(min_lines) {
            for j in (i + min_lines)..=lines.len().saturating_sub(min_lines) {
                let block1 = &lines[i..i + min_lines];
                let block2 = &lines[j..j + min_lines];

                let block1_text = block1.join("\n");
                let block2_text = block2.join("\n");

                let similarity = self.calculate_content_similarity(&block1_text, &block2_text);

                if similarity >= similarity_threshold {
                    duplicates.push(serde_json::json!({
                        "similarity": similarity,
                        "blocks": [
                            {
                                "start_line": i + 1,
                                "end_line": i + min_lines,
                                "content": block1_text
                            },
                            {
                                "start_line": j + 1,
                                "end_line": j + min_lines,
                                "content": block2_text
                            }
                        ],
                        "type": "block_similarity"
                    }));
                }
            }
        }

        Ok(duplicates)
    }

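    /// Content similarity after identifiers, numbers, and string literals
    /// are normalized away, so `def f(x)` and `def g(y)` compare as equal.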
    pub fn calculate_structural_similarity(&self, content1: &str, content2: &str) -> f64 {
        let normalized1 = self.normalize_for_structure(content1);
        let normalized2 = self.normalize_for_structure(content2);

        self.calculate_content_similarity(&normalized1, &normalized2)
    }

    fn normalize_for_structure(&self, content: &str) -> String {
        // Compile each regex once instead of once per line; the output is
        // unchanged.
        let ident_re = regex::Regex::new(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b").unwrap();
        let num_re = regex::Regex::new(r"\b\d+\b").unwrap();
        let str_re = regex::Regex::new(r#""[^"]*""#).unwrap();

        content
            .lines()
            .map(|line| {
                let trimmed = line.trim();
                let normalized = ident_re.replace_all(trimmed, "IDENTIFIER");
                let normalized = num_re.replace_all(&normalized, "NUMBER");
                str_re.replace_all(&normalized, "STRING").to_string()
            })
            .collect::<Vec<_>>()
            .join("\n")
    }
}

impl Default for DuplicateAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    #[test]
    fn test_content_similarity() {
        let analyzer = DuplicateAnalyzer::new();

        let content1 = "line1\nline2\nline3";
        let content2 = "line1\nline2\nline4";

        let similarity = analyzer.calculate_content_similarity(content1, content2);
        assert!(similarity > 0.0 && similarity < 1.0);

        let identical = analyzer.calculate_content_similarity(content1, content1);
        assert_eq!(identical, 1.0);
    }

    #[test]
    fn test_structural_similarity() {
        let analyzer = DuplicateAnalyzer::new();

        let content1 = "def func1(x, y):\n return x + y";
        let content2 = "def func2(a, b):\n return a + b";

        let similarity = analyzer.calculate_structural_similarity(content1, content2);
        assert!(similarity > 0.8);
    }

    #[test]
    fn test_find_duplicate_blocks() {
        let analyzer = DuplicateAnalyzer::new();

        let content = "line1\nline2\nline3\nline4\nline1\nline2\nline3\nline5";
        let duplicates = analyzer.find_duplicate_blocks(content, 3, 0.8).unwrap();

        assert!(!duplicates.is_empty(), "Should find duplicate code");
    }

    #[test]
    fn test_find_code_duplicates() {
        let mut analyzer = DuplicateAnalyzer::new();
        let temp_dir = tempdir().unwrap();

        let file1_path = temp_dir.path().join("file1.py");
        let file2_path = temp_dir.path().join("file2.py");

        fs::write(&file1_path, "def test():\n return 1").unwrap();
        fs::write(&file2_path, "def test():\n return 1").unwrap();

        let duplicates = analyzer
            .find_code_duplicates(temp_dir.path(), 0.8, 1, &[])
            .unwrap();
        assert!(!duplicates.is_empty(), "Should find duplicate code");
    }
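
    // An added sketch, not part of the original suite: exclude_patterns
    // should cause matching paths to be skipped entirely. The "vendor"
    // directory name is illustrative.
    #[test]
    fn test_exclude_patterns_skip_matching_paths() {
        let mut analyzer = DuplicateAnalyzer::new();
        let temp_dir = tempdir().unwrap();
        let vendor_dir = temp_dir.path().join("vendor");
        fs::create_dir(&vendor_dir).unwrap();

        fs::write(vendor_dir.join("a.py"), "def test():\n return 1").unwrap();
        fs::write(vendor_dir.join("b.py"), "def test():\n return 1").unwrap();

        let duplicates = analyzer
            .find_code_duplicates(temp_dir.path(), 0.8, 1, &["vendor".to_string()])
            .unwrap();
        assert!(duplicates.is_empty(), "Excluded paths should not be analyzed");
    }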
}