codeprism_analysis/
duplicates.rs

1//! Code duplicate detection module with AST-based analysis and semantic understanding
2
3use anyhow::Result;
4use serde_json::Value;
5use std::collections::{HashMap, HashSet};
6use std::path::Path;
7use walkdir::WalkDir;
8
/// AST node structure for comparison
#[derive(Debug, Clone, PartialEq)]
pub struct AstNode {
    /// Coarse classification of the node: a keyword name, "control_structure",
    /// "statement" (fallback), or "file" for the root node.
    pub node_type: String,
    /// Child nodes; the current line-based parser produces a flat list with
    /// one child per significant (non-blank, non-comment) line.
    pub children: Vec<AstNode>,
    /// Line text with identifiers/numbers/strings replaced by placeholder tokens.
    pub normalized_text: String,
    /// Hash of the normalized text, used for fast structural comparison.
    pub structural_hash: u64,
}
17
/// Duplicate type classification
#[derive(Debug, Clone, PartialEq)]
pub enum DuplicateType {
    /// Near-identical content (similarity >= 0.95), i.e. copy-paste.
    ExactCopy,
    /// Same code structure per AST comparison, names/literals may differ.
    StructuralSimilar,
    /// Overlapping semantic keyword usage suggesting similar functionality.
    SemanticSimilar,
    /// Shared design/code pattern (detector is currently a stub).
    PatternDuplicate,
}
26
/// Refactoring suggestion for duplicates
#[derive(Debug, Clone)]
pub struct RefactoringSuggestion {
    /// Short category label for the suggestion.
    pub suggestion_type: String,
    /// Human-readable explanation of the proposed refactoring.
    pub description: String,
    /// Rough effort estimate (free-form text).
    pub estimated_effort: String,
    /// Expected benefit if the refactoring is applied.
    pub potential_savings: String,
    /// Ordered steps for carrying out the refactoring.
    pub implementation_steps: Vec<String>,
}
36
/// Enhanced duplicate result with detailed analysis
#[derive(Debug, Clone)]
pub struct DuplicateResult {
    /// Similarity between the involved files, in [0.0, 1.0].
    pub similarity_score: f64,
    /// Which detector produced this result.
    pub duplicate_type: DuplicateType,
    /// Files participating in the duplication (detectors currently emit pairs).
    pub files: Vec<DuplicateFile>,
    /// Names of patterns/keywords found in all involved files.
    pub common_patterns: Vec<String>,
    /// Suggested refactorings; left empty by detectors and populated later by
    /// the enhancement pass.
    pub refactoring_suggestions: Vec<RefactoringSuggestion>,
    /// Confidence in the finding; detectors scale this down from raw similarity.
    pub confidence_level: f64,
    /// Estimated payoff of removing the duplication.
    pub estimated_savings: DuplicateSavings,
}
48
/// One file's role in a duplicate finding.
#[derive(Debug, Clone)]
pub struct DuplicateFile {
    /// Display path of the file.
    pub path: String,
    /// Number of lines considered (whole-file line count, or AST child count
    /// for structural matches).
    pub lines: usize,
    /// First duplicated line when the match is a sub-range; currently always None.
    pub start_line: Option<usize>,
    /// Last duplicated line when the match is a sub-range; currently always None.
    pub end_line: Option<usize>,
    /// Heuristic complexity of the file's content.
    pub complexity_score: f64,
}
57
/// Estimated payoff from removing a duplication.
#[derive(Debug, Clone)]
pub struct DuplicateSavings {
    /// Approximate number of lines that could be eliminated.
    pub lines_of_code: usize,
    /// Qualitative maintenance impact (e.g. "Medium", "High").
    pub maintenance_effort: String,
    /// Qualitative bug-risk reduction (e.g. "Low", "Significant").
    pub bug_risk_reduction: String,
}
64
/// Advanced duplicate analyzer with AST and semantic analysis
pub struct DuplicateAnalyzer {
    /// Cache for parsed AST structures
    #[allow(dead_code)] // Will be used for AST caching optimization
    ast_cache: HashMap<String, AstNode>,
    /// Language-specific analyzers for different programming languages,
    /// keyed by file extension (e.g. "py", "js", "ts")
    language_analyzers: HashMap<String, LanguageAnalyzer>,
    /// Semantic patterns for identifying functional similarities,
    /// keyed by pattern name (e.g. "error_handling") with indicative keywords
    semantic_patterns: HashMap<String, Vec<String>>,
}
75
/// Per-language token tables used by the line-level pseudo-parser.
#[derive(Debug, Clone)]
struct LanguageAnalyzer {
    /// Keywords used to classify a line (checked before control structures).
    keywords: Vec<String>,
    #[allow(dead_code)] // Will be used for operator-aware analysis
    operators: Vec<String>,
    /// Control-flow tokens; lines containing one are tagged "control_structure".
    control_structures: Vec<String>,
    /// Prefixes that mark a line as a comment for this language.
    comment_patterns: Vec<String>,
}
84
85impl DuplicateAnalyzer {
86    pub fn new() -> Self {
87        let mut analyzer = Self {
88            ast_cache: HashMap::new(),
89            semantic_patterns: HashMap::new(),
90            language_analyzers: HashMap::new(),
91        };
92        analyzer.initialize_language_analyzers();
93        analyzer.initialize_semantic_patterns();
94        analyzer
95    }
96
97    fn initialize_language_analyzers(&mut self) {
98        // Python analyzer
99        self.language_analyzers.insert(
100            "py".to_string(),
101            LanguageAnalyzer {
102                keywords: vec![
103                    "def".to_string(),
104                    "class".to_string(),
105                    "if".to_string(),
106                    "for".to_string(),
107                    "while".to_string(),
108                    "try".to_string(),
109                    "except".to_string(),
110                    "with".to_string(),
111                    "import".to_string(),
112                    "from".to_string(),
113                    "return".to_string(),
114                    "yield".to_string(),
115                ],
116                operators: vec![
117                    "+".to_string(),
118                    "-".to_string(),
119                    "*".to_string(),
120                    "/".to_string(),
121                    "==".to_string(),
122                    "!=".to_string(),
123                    ">=".to_string(),
124                    "<=".to_string(),
125                    "and".to_string(),
126                    "or".to_string(),
127                    "not".to_string(),
128                    "in".to_string(),
129                ],
130                control_structures: vec![
131                    "if".to_string(),
132                    "elif".to_string(),
133                    "else".to_string(),
134                    "for".to_string(),
135                    "while".to_string(),
136                    "try".to_string(),
137                    "except".to_string(),
138                    "finally".to_string(),
139                ],
140                comment_patterns: vec!["#".to_string(), "\"\"\"".to_string(), "'''".to_string()],
141            },
142        );
143
144        // JavaScript/TypeScript analyzer
145        self.language_analyzers.insert(
146            "js".to_string(),
147            LanguageAnalyzer {
148                keywords: vec![
149                    "function".to_string(),
150                    "class".to_string(),
151                    "if".to_string(),
152                    "for".to_string(),
153                    "while".to_string(),
154                    "try".to_string(),
155                    "catch".to_string(),
156                    "const".to_string(),
157                    "let".to_string(),
158                    "var".to_string(),
159                    "return".to_string(),
160                    "async".to_string(),
161                    "await".to_string(),
162                    "import".to_string(),
163                    "export".to_string(),
164                ],
165                operators: vec![
166                    "+".to_string(),
167                    "-".to_string(),
168                    "*".to_string(),
169                    "/".to_string(),
170                    "==".to_string(),
171                    "===".to_string(),
172                    "!=".to_string(),
173                    "!==".to_string(),
174                    "&&".to_string(),
175                    "||".to_string(),
176                    "!".to_string(),
177                ],
178                control_structures: vec![
179                    "if".to_string(),
180                    "else".to_string(),
181                    "switch".to_string(),
182                    "case".to_string(),
183                    "for".to_string(),
184                    "while".to_string(),
185                    "do".to_string(),
186                    "try".to_string(),
187                    "catch".to_string(),
188                    "finally".to_string(),
189                ],
190                comment_patterns: vec!["//".to_string(), "/*".to_string(), "*/".to_string()],
191            },
192        );
193
194        // Copy JS analyzer for TS files
195        self.language_analyzers
196            .insert("ts".to_string(), self.language_analyzers["js"].clone());
197    }
198
199    fn initialize_semantic_patterns(&mut self) {
200        // Common programming patterns
201        self.semantic_patterns.insert(
202            "data_validation".to_string(),
203            vec![
204                "validate".to_string(),
205                "check".to_string(),
206                "verify".to_string(),
207                "assert".to_string(),
208                "ensure".to_string(),
209                "require".to_string(),
210            ],
211        );
212
213        self.semantic_patterns.insert(
214            "error_handling".to_string(),
215            vec![
216                "try".to_string(),
217                "catch".to_string(),
218                "except".to_string(),
219                "error".to_string(),
220                "exception".to_string(),
221                "throw".to_string(),
222                "raise".to_string(),
223                "handle".to_string(),
224            ],
225        );
226
227        self.semantic_patterns.insert(
228            "database_operations".to_string(),
229            vec![
230                "select".to_string(),
231                "insert".to_string(),
232                "update".to_string(),
233                "delete".to_string(),
234                "query".to_string(),
235                "execute".to_string(),
236                "commit".to_string(),
237                "rollback".to_string(),
238            ],
239        );
240
241        self.semantic_patterns.insert(
242            "api_patterns".to_string(),
243            vec![
244                "get".to_string(),
245                "post".to_string(),
246                "put".to_string(),
247                "delete".to_string(),
248                "patch".to_string(),
249                "request".to_string(),
250                "response".to_string(),
251                "endpoint".to_string(),
252                "route".to_string(),
253            ],
254        );
255    }
256
    /// Find code duplicates with advanced AST and semantic analysis
    ///
    /// Walks `repo_path` collecting files with a recognized source extension
    /// that are at least `min_lines` long and whose path does not contain any
    /// of `exclude_patterns` (plain substring match). Each file is parsed into
    /// a lightweight AST, then four detectors run (exact, structural,
    /// semantic, pattern). Overlapping hits are deduplicated and each
    /// surviving result is enriched with refactoring suggestions.
    ///
    /// # Errors
    /// Propagates failures from AST parsing; unreadable files (permissions,
    /// non-UTF-8) are skipped silently.
    pub fn find_code_duplicates_advanced(
        &mut self,
        repo_path: &Path,
        similarity_threshold: f64,
        min_lines: usize,
        exclude_patterns: &[String],
    ) -> Result<Vec<DuplicateResult>> {
        let mut duplicates = Vec::new();
        let mut file_contents = HashMap::new();
        let mut file_asts = HashMap::new();

        // Collect all source files and parse them
        for entry in WalkDir::new(repo_path)
            .into_iter()
            .filter_map(|e| e.ok())
            .filter(|e| e.file_type().is_file())
        {
            let path = entry.path();
            if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
                if [
                    "js", "ts", "py", "java", "rs", "c", "cpp", "go", "rb", "php",
                ]
                .contains(&ext)
                {
                    let path_str = path.to_string_lossy();
                    if exclude_patterns
                        .iter()
                        .any(|pattern| path_str.contains(pattern))
                    {
                        continue;
                    }

                    // Unreadable files are skipped rather than failing the scan.
                    if let Ok(content) = std::fs::read_to_string(path) {
                        let lines = content.lines().count();
                        if lines >= min_lines {
                            // Parse AST for the file
                            let ast = self.parse_file_ast(&content, ext)?;
                            file_asts.insert(path.to_path_buf(), ast);
                            file_contents.insert(path.to_path_buf(), content);
                        }
                    }
                }
            }
        }

        // Advanced duplicate detection using multiple techniques
        duplicates.extend(self.find_exact_duplicates(&file_contents, min_lines)?);
        duplicates.extend(self.find_structural_duplicates(
            &file_asts,
            similarity_threshold,
            min_lines,
        )?);
        duplicates.extend(self.find_semantic_duplicates(
            &file_contents,
            similarity_threshold,
            min_lines,
        )?);
        duplicates.extend(self.find_pattern_duplicates(&file_contents, similarity_threshold)?);

        // Remove overlapping duplicates and enhance with refactoring suggestions
        let deduplicated = self.deduplicate_results(duplicates);
        let enhanced_results = deduplicated
            .into_iter()
            .map(|dup| self.enhance_with_refactoring_suggestions(dup))
            .collect();

        Ok(enhanced_results)
    }
326
327    /// Parse file content into AST representation
328    fn parse_file_ast(&mut self, content: &str, language: &str) -> Result<AstNode> {
329        // Simple AST parsing - in production this would use tree-sitter or similar
330        let lines: Vec<&str> = content.lines().collect();
331        let mut root_children = Vec::new();
332
333        for line in lines.iter() {
334            let trimmed = line.trim();
335            if trimmed.is_empty() || self.is_comment_line(trimmed, language) {
336                continue;
337            }
338
339            let node_type = self.classify_line_type(trimmed, language);
340            let normalized = self.normalize_line_for_ast(trimmed, language);
341            let hash = self.calculate_structural_hash(&normalized);
342
343            root_children.push(AstNode {
344                node_type: node_type.clone(),
345                children: Vec::new(),
346                normalized_text: normalized,
347                structural_hash: hash,
348            });
349        }
350
351        Ok(AstNode {
352            node_type: "file".to_string(),
353            children: root_children,
354            normalized_text: "".to_string(),
355            structural_hash: self.calculate_structural_hash(content),
356        })
357    }
358
359    fn is_comment_line(&self, line: &str, language: &str) -> bool {
360        if let Some(analyzer) = self.language_analyzers.get(language) {
361            analyzer
362                .comment_patterns
363                .iter()
364                .any(|pattern| line.starts_with(pattern))
365        } else {
366            line.starts_with("//") || line.starts_with("#") || line.starts_with("/*")
367        }
368    }
369
370    fn classify_line_type(&self, line: &str, language: &str) -> String {
371        if let Some(analyzer) = self.language_analyzers.get(language) {
372            for keyword in &analyzer.keywords {
373                if line.contains(keyword) {
374                    return keyword.clone();
375                }
376            }
377            for control in &analyzer.control_structures {
378                if line.contains(control) {
379                    return "control_structure".to_string();
380                }
381            }
382        }
383        "statement".to_string()
384    }
385
386    fn normalize_line_for_ast(&self, line: &str, language: &str) -> String {
387        let mut normalized = line.to_string();
388
389        // Remove language-specific syntax while preserving structure
390        if let Some(_analyzer) = self.language_analyzers.get(language) {
391            // Replace identifiers with normalized tokens
392            normalized = regex::Regex::new(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")
393                .unwrap()
394                .replace_all(&normalized, "IDENTIFIER")
395                .to_string();
396
397            // Replace numbers with normalized tokens
398            normalized = regex::Regex::new(r"\b\d+\b")
399                .unwrap()
400                .replace_all(&normalized, "NUMBER")
401                .to_string();
402
403            // Replace strings with normalized tokens
404            normalized = regex::Regex::new(r#""[^"]*""#)
405                .unwrap()
406                .replace_all(&normalized, "STRING")
407                .to_string();
408        }
409
410        normalized
411    }
412
413    fn calculate_structural_hash(&self, content: &str) -> u64 {
414        use std::collections::hash_map::DefaultHasher;
415        use std::hash::{Hash, Hasher};
416
417        let mut hasher = DefaultHasher::new();
418        content.hash(&mut hasher);
419        hasher.finish()
420    }
421
422    /// Find exact duplicates (copy-paste)
423    fn find_exact_duplicates(
424        &self,
425        file_contents: &HashMap<std::path::PathBuf, String>,
426        min_lines: usize,
427    ) -> Result<Vec<DuplicateResult>> {
428        let mut duplicates = Vec::new();
429        let mut analyzed_pairs = HashSet::new();
430
431        for (file1, content1) in file_contents {
432            for (file2, content2) in file_contents {
433                if file1 >= file2 || analyzed_pairs.contains(&(file1.clone(), file2.clone())) {
434                    continue;
435                }
436                analyzed_pairs.insert((file1.clone(), file2.clone()));
437
438                let similarity = self.calculate_exact_similarity(content1, content2);
439                if similarity >= 0.95 {
440                    // Very high threshold for exact duplicates
441                    let lines1 = content1.lines().count();
442                    let lines2 = content2.lines().count();
443
444                    if lines1 >= min_lines && lines2 >= min_lines {
445                        duplicates.push(DuplicateResult {
446                            similarity_score: similarity,
447                            duplicate_type: DuplicateType::ExactCopy,
448                            files: vec![
449                                DuplicateFile {
450                                    path: file1.display().to_string(),
451                                    lines: lines1,
452                                    start_line: None,
453                                    end_line: None,
454                                    complexity_score: self.calculate_complexity_score(content1),
455                                },
456                                DuplicateFile {
457                                    path: file2.display().to_string(),
458                                    lines: lines2,
459                                    start_line: None,
460                                    end_line: None,
461                                    complexity_score: self.calculate_complexity_score(content2),
462                                },
463                            ],
464                            common_patterns: self.identify_common_patterns(content1, content2),
465                            refactoring_suggestions: Vec::new(), // Will be filled later
466                            confidence_level: 0.95,
467                            estimated_savings: DuplicateSavings {
468                                lines_of_code: lines1.min(lines2),
469                                maintenance_effort: "High".to_string(),
470                                bug_risk_reduction: "Significant".to_string(),
471                            },
472                        });
473                    }
474                }
475            }
476        }
477
478        Ok(duplicates)
479    }
480
    /// Find structural duplicates using AST comparison
    ///
    /// Compares every unordered pair of file ASTs; a pair is reported when the
    /// structural similarity reaches `similarity_threshold` and both ASTs have
    /// at least `min_lines` children (one child per significant line).
    fn find_structural_duplicates(
        &self,
        file_asts: &HashMap<std::path::PathBuf, AstNode>,
        similarity_threshold: f64,
        min_lines: usize,
    ) -> Result<Vec<DuplicateResult>> {
        let mut duplicates = Vec::new();
        let mut analyzed_pairs = HashSet::new();

        for (file1, ast1) in file_asts {
            for (file2, ast2) in file_asts {
                // NOTE(review): `file1 >= file2` already visits each unordered
                // pair once; the `analyzed_pairs` set is redundant bookkeeping.
                if file1 >= file2 || analyzed_pairs.contains(&(file1.clone(), file2.clone())) {
                    continue;
                }
                analyzed_pairs.insert((file1.clone(), file2.clone()));

                let similarity = self.calculate_structural_similarity_ast(ast1, ast2);
                if similarity >= similarity_threshold
                    && ast1.children.len() >= min_lines
                    && ast2.children.len() >= min_lines
                {
                    duplicates.push(DuplicateResult {
                        similarity_score: similarity,
                        duplicate_type: DuplicateType::StructuralSimilar,
                        files: vec![
                            DuplicateFile {
                                path: file1.display().to_string(),
                                lines: ast1.children.len(),
                                start_line: None,
                                end_line: None,
                                complexity_score: self.calculate_ast_complexity(ast1),
                            },
                            DuplicateFile {
                                path: file2.display().to_string(),
                                lines: ast2.children.len(),
                                start_line: None,
                                end_line: None,
                                complexity_score: self.calculate_ast_complexity(ast2),
                            },
                        ],
                        common_patterns: self.identify_structural_patterns(ast1, ast2),
                        refactoring_suggestions: Vec::new(),
                        confidence_level: similarity * 0.9, // Slightly lower confidence for structural
                        estimated_savings: DuplicateSavings {
                            lines_of_code: ast1.children.len().min(ast2.children.len()),
                            maintenance_effort: "Medium".to_string(),
                            bug_risk_reduction: "Moderate".to_string(),
                        },
                    });
                }
            }
        }

        Ok(duplicates)
    }
537
    /// Find semantic duplicates based on functionality
    ///
    /// Compares every unordered pair of files by overlap in registered
    /// semantic keyword groups; a pair is reported when the semantic
    /// similarity reaches `similarity_threshold` and both files have at least
    /// `min_lines` lines.
    fn find_semantic_duplicates(
        &self,
        file_contents: &HashMap<std::path::PathBuf, String>,
        similarity_threshold: f64,
        min_lines: usize,
    ) -> Result<Vec<DuplicateResult>> {
        let mut duplicates = Vec::new();
        let mut analyzed_pairs = HashSet::new();

        for (file1, content1) in file_contents {
            for (file2, content2) in file_contents {
                // NOTE(review): `file1 >= file2` already visits each unordered
                // pair once; the `analyzed_pairs` set is redundant bookkeeping.
                if file1 >= file2 || analyzed_pairs.contains(&(file1.clone(), file2.clone())) {
                    continue;
                }
                analyzed_pairs.insert((file1.clone(), file2.clone()));

                let similarity = self.calculate_semantic_similarity(content1, content2);
                if similarity >= similarity_threshold {
                    let lines1 = content1.lines().count();
                    let lines2 = content2.lines().count();

                    if lines1 >= min_lines && lines2 >= min_lines {
                        duplicates.push(DuplicateResult {
                            similarity_score: similarity,
                            duplicate_type: DuplicateType::SemanticSimilar,
                            files: vec![
                                DuplicateFile {
                                    path: file1.display().to_string(),
                                    lines: lines1,
                                    start_line: None,
                                    end_line: None,
                                    complexity_score: self.calculate_complexity_score(content1),
                                },
                                DuplicateFile {
                                    path: file2.display().to_string(),
                                    lines: lines2,
                                    start_line: None,
                                    end_line: None,
                                    complexity_score: self.calculate_complexity_score(content2),
                                },
                            ],
                            common_patterns: self.identify_semantic_patterns(content1, content2),
                            refactoring_suggestions: Vec::new(),
                            confidence_level: similarity * 0.8, // Lower confidence for semantic
                            estimated_savings: DuplicateSavings {
                                lines_of_code: lines1.min(lines2) / 2, // Conservative estimate
                                maintenance_effort: "Medium".to_string(),
                                bug_risk_reduction: "Low".to_string(),
                            },
                        });
                    }
                }
            }
        }

        Ok(duplicates)
    }
596
597    /// Find pattern-based duplicates (design patterns, common code structures)
598    fn find_pattern_duplicates(
599        &self,
600        _file_contents: &HashMap<std::path::PathBuf, String>,
601        _similarity_threshold: f64,
602    ) -> Result<Vec<DuplicateResult>> {
603        let duplicates = Vec::new();
604        // Pattern-based detection would analyze for common design patterns
605        // This is a simplified implementation
606        Ok(duplicates)
607    }
608
    /// Similarity used for exact-copy detection; currently just delegates to
    /// the generic content-similarity metric (defined elsewhere in this file).
    fn calculate_exact_similarity(&self, content1: &str, content2: &str) -> f64 {
        self.calculate_content_similarity(content1, content2)
    }
612
613    fn calculate_structural_similarity_ast(&self, ast1: &AstNode, ast2: &AstNode) -> f64 {
614        if ast1.children.is_empty() && ast2.children.is_empty() {
615            return if ast1.normalized_text == ast2.normalized_text {
616                1.0
617            } else {
618                0.0
619            };
620        }
621
622        let mut matches = 0;
623        let total = ast1.children.len().max(ast2.children.len());
624
625        for child1 in &ast1.children {
626            for child2 in &ast2.children {
627                if child1.node_type == child2.node_type
628                    && child1.structural_hash == child2.structural_hash
629                {
630                    matches += 1;
631                    break;
632                }
633            }
634        }
635
636        if total == 0 {
637            0.0
638        } else {
639            matches as f64 / total as f64
640        }
641    }
642
643    fn calculate_semantic_similarity(&self, content1: &str, content2: &str) -> f64 {
644        let mut similarity_score = 0.0;
645        let mut _pattern_matches = 0;
646        let mut total_patterns = 0;
647
648        for patterns in self.semantic_patterns.values() {
649            total_patterns += patterns.len();
650
651            let count1 = patterns
652                .iter()
653                .map(|p| content1.matches(p).count())
654                .sum::<usize>();
655            let count2 = patterns
656                .iter()
657                .map(|p| content2.matches(p).count())
658                .sum::<usize>();
659
660            if count1 > 0 && count2 > 0 {
661                _pattern_matches += patterns.len();
662                similarity_score += (count1.min(count2) as f64) / (count1.max(count2) as f64);
663            }
664        }
665
666        if total_patterns == 0 {
667            0.0
668        } else {
669            similarity_score / total_patterns as f64
670        }
671    }
672
673    /// Legacy method for backward compatibility
674    pub fn find_code_duplicates(
675        &mut self,
676        repo_path: &Path,
677        similarity_threshold: f64,
678        min_lines: usize,
679        exclude_patterns: &[String],
680    ) -> Result<Vec<Value>> {
681        let advanced_results = self.find_code_duplicates_advanced(
682            repo_path,
683            similarity_threshold,
684            min_lines,
685            exclude_patterns,
686        )?;
687
688        // Convert advanced results to legacy format
689        let legacy_results = advanced_results
690            .into_iter()
691            .map(|result| {
692                serde_json::json!({
693                    "similarity": result.similarity_score,
694                    "files": result.files.iter().map(|f| serde_json::json!({
695                        "path": f.path,
696                        "lines": f.lines
697                    })).collect::<Vec<_>>(),
698                    "lines": result.files.iter().map(|f| f.lines).min().unwrap_or(0),
699                    "type": match result.duplicate_type {
700                        DuplicateType::ExactCopy => "exact_copy",
701                        DuplicateType::StructuralSimilar => "structural_similar",
702                        DuplicateType::SemanticSimilar => "semantic_similar",
703                        DuplicateType::PatternDuplicate => "pattern_duplicate",
704                    },
705                    "confidence_level": result.confidence_level,
706                    "refactoring_suggestions": result.refactoring_suggestions.len()
707                })
708            })
709            .collect();
710
711        Ok(legacy_results)
712    }
713
    /// Heuristic complexity score for raw file content.
    ///
    /// Combines three ratios — non-empty lines per total line, control-keyword
    /// occurrences per non-empty line, and definition keywords per non-empty
    /// line — then scales by 100.
    ///
    /// NOTE(review): counts use plain substring matching, so e.g. "if" inside
    /// "shift" or "elif" also counts; treat scores as comparative only.
    fn calculate_complexity_score(&self, content: &str) -> f64 {
        let lines = content.lines().count();
        let non_empty_lines = content
            .lines()
            .filter(|line| !line.trim().is_empty())
            .count();

        // Simple complexity metric based on line ratios and control structures
        let control_structures = content.matches("if").count()
            + content.matches("for").count()
            + content.matches("while").count()
            + content.matches("try").count()
            + content.matches("catch").count()
            + content.matches("switch").count();
        let functions = content.matches("def ").count()
            + content.matches("function ").count()
            + content.matches("class ").count();

        // `.max(1)` guards the divisions against empty content.
        let base_complexity = non_empty_lines as f64 / lines.max(1) as f64;
        let control_complexity = control_structures as f64 / non_empty_lines.max(1) as f64;
        let function_complexity = functions as f64 / non_empty_lines.max(1) as f64;

        (base_complexity + control_complexity + function_complexity) * 100.0
    }
738
739    fn calculate_ast_complexity(&self, ast: &AstNode) -> f64 {
740        let mut complexity = 0.0;
741        complexity += ast.children.len() as f64;
742
743        for child in &ast.children {
744            if child.node_type.contains("if")
745                || child.node_type.contains("for")
746                || child.node_type.contains("while")
747            {
748                complexity += 2.0; // Control structures add more complexity
749            } else {
750                complexity += 1.0;
751            }
752        }
753
754        complexity
755    }
756
757    fn identify_common_patterns(&self, content1: &str, content2: &str) -> Vec<String> {
758        let mut patterns = Vec::new();
759
760        // Check for common semantic patterns
761        for (pattern_type, pattern_keywords) in &self.semantic_patterns {
762            let matches1 = pattern_keywords
763                .iter()
764                .filter(|&keyword| content1.contains(keyword))
765                .count();
766            let matches2 = pattern_keywords
767                .iter()
768                .filter(|&keyword| content2.contains(keyword))
769                .count();
770
771            if matches1 > 0 && matches2 > 0 {
772                patterns.push(pattern_type.clone());
773            }
774        }
775
776        // Check for structural patterns
777        let common_keywords = ["function", "class", "if", "for", "while", "try", "catch"];
778        for keyword in &common_keywords {
779            if content1.contains(keyword) && content2.contains(keyword) {
780                patterns.push(keyword.to_string());
781            }
782        }
783
784        patterns
785    }
786
787    fn identify_structural_patterns(&self, ast1: &AstNode, ast2: &AstNode) -> Vec<String> {
788        let mut patterns = Vec::new();
789
790        // Find common node types
791        let types1: HashSet<_> = ast1.children.iter().map(|child| &child.node_type).collect();
792        let types2: HashSet<_> = ast2.children.iter().map(|child| &child.node_type).collect();
793
794        for common_type in types1.intersection(&types2) {
795            patterns.push(format!("structural_{common_type}"));
796        }
797
798        patterns
799    }
800
801    fn identify_semantic_patterns(&self, content1: &str, content2: &str) -> Vec<String> {
802        let mut patterns = Vec::new();
803
804        for (pattern_type, keywords) in &self.semantic_patterns {
805            let score1 = keywords
806                .iter()
807                .map(|k| content1.matches(k).count())
808                .sum::<usize>();
809            let score2 = keywords
810                .iter()
811                .map(|k| content2.matches(k).count())
812                .sum::<usize>();
813
814            if score1 > 0 && score2 > 0 {
815                let similarity = (score1.min(score2) as f64) / (score1.max(score2) as f64);
816                if similarity > 0.5 {
817                    patterns.push(format!("semantic_{pattern_type}"));
818                }
819            }
820        }
821
822        patterns
823    }
824
825    fn deduplicate_results(&self, mut duplicates: Vec<DuplicateResult>) -> Vec<DuplicateResult> {
826        // Remove overlapping duplicates - keep the one with highest confidence
827        duplicates.sort_by(|a, b| b.confidence_level.partial_cmp(&a.confidence_level).unwrap());
828
829        let mut result = Vec::new();
830        let mut seen_files = HashSet::new();
831
832        for duplicate in duplicates {
833            let file_paths: Vec<String> = duplicate.files.iter().map(|f| f.path.clone()).collect();
834
835            if !file_paths.iter().any(|path| seen_files.contains(path)) {
836                for path in &file_paths {
837                    seen_files.insert(path.clone());
838                }
839                result.push(duplicate);
840            }
841        }
842
843        result
844    }
845
    /// Populate `refactoring_suggestions` on a duplicate result.
    ///
    /// One canonical suggestion is chosen per `DuplicateType`, and an extra
    /// "simplify" suggestion is appended when the average per-file complexity
    /// score exceeds 50. Returns the same result with suggestions attached.
    fn enhance_with_refactoring_suggestions(
        &self,
        mut duplicate: DuplicateResult,
    ) -> DuplicateResult {
        let mut suggestions = Vec::new();

        // Each duplicate type maps to a single canonical refactoring strategy.
        match duplicate.duplicate_type {
            // Byte-identical copies: trivially extractable into one function.
            DuplicateType::ExactCopy => {
                suggestions.push(RefactoringSuggestion {
                    suggestion_type: "Extract Common Function".to_string(),
                    description: "Extract the duplicated code into a common function or module"
                        .to_string(),
                    estimated_effort: "Low".to_string(),
                    potential_savings: format!(
                        "{} lines of code",
                        duplicate.estimated_savings.lines_of_code
                    ),
                    implementation_steps: vec![
                        "1. Create a new function with the common code".to_string(),
                        "2. Replace duplicate instances with function calls".to_string(),
                        "3. Test to ensure functionality is preserved".to_string(),
                    ],
                });
            }
            // Same shape, different details: factor shared structure into a base.
            DuplicateType::StructuralSimilar => {
                suggestions.push(RefactoringSuggestion {
                    suggestion_type: "Create Abstract Base Class".to_string(),
                    description: "Create a common base class or interface for similar structures"
                        .to_string(),
                    estimated_effort: "Medium".to_string(),
                    // Halved because only the shared portion is expected to be
                    // factored out, not the whole duplicated span.
                    potential_savings: format!(
                        "{} lines of code reduction",
                        duplicate.estimated_savings.lines_of_code / 2
                    ),
                    implementation_steps: vec![
                        "1. Identify common structural elements".to_string(),
                        "2. Create base class or interface".to_string(),
                        "3. Refactor duplicate classes to inherit/implement".to_string(),
                        "4. Test inheritance hierarchy".to_string(),
                    ],
                });
            }
            // Same intent, different code: swap implementations behind a strategy.
            DuplicateType::SemanticSimilar => {
                suggestions.push(RefactoringSuggestion {
                    suggestion_type: "Strategy Pattern".to_string(),
                    description: "Use strategy pattern to handle similar functionality".to_string(),
                    estimated_effort: "High".to_string(),
                    potential_savings: "Improved maintainability and reduced complexity"
                        .to_string(),
                    implementation_steps: vec![
                        "1. Define common interface for similar behaviors".to_string(),
                        "2. Implement concrete strategies".to_string(),
                        "3. Refactor to use strategy pattern".to_string(),
                        "4. Add configuration for strategy selection".to_string(),
                    ],
                });
            }
            // Shared algorithmic skeleton: template method fits best.
            DuplicateType::PatternDuplicate => {
                suggestions.push(RefactoringSuggestion {
                    suggestion_type: "Template Method".to_string(),
                    description: "Use template method pattern for common algorithmic structure"
                        .to_string(),
                    estimated_effort: "Medium".to_string(),
                    potential_savings: "Reduced code duplication and improved consistency"
                        .to_string(),
                    implementation_steps: vec![
                        "1. Identify common algorithm structure".to_string(),
                        "2. Create template method in base class".to_string(),
                        "3. Implement varying steps in subclasses".to_string(),
                    ],
                });
            }
        }

        // Add complexity-based suggestions.
        // NOTE(review): if `duplicate.files` is empty this divides by zero,
        // yielding NaN; `NaN > 50.0` is false, so no suggestion is added in
        // that case — confirm that silent outcome is intended.
        let avg_complexity = duplicate
            .files
            .iter()
            .map(|f| f.complexity_score)
            .sum::<f64>()
            / duplicate.files.len() as f64;

        if avg_complexity > 50.0 {
            suggestions.push(RefactoringSuggestion {
                suggestion_type: "Simplify Complex Code".to_string(),
                description:
                    "Break down complex duplicated code into smaller, more manageable pieces"
                        .to_string(),
                estimated_effort: "High".to_string(),
                potential_savings: "Improved readability and maintainability".to_string(),
                implementation_steps: vec![
                    "1. Identify complex sections within duplicates".to_string(),
                    "2. Extract helper functions".to_string(),
                    "3. Simplify conditional logic".to_string(),
                    "4. Add comprehensive tests".to_string(),
                ],
            });
        }

        duplicate.refactoring_suggestions = suggestions;
        duplicate
    }
948
949    /// Calculate content similarity between two text blocks using Jaccard coefficient
950    pub fn calculate_content_similarity(&self, content1: &str, content2: &str) -> f64 {
951        let lines1: Vec<String> = content1
952            .lines()
953            .map(|s| s.trim().to_string())
954            .filter(|s| !s.is_empty() && !s.starts_with("//") && !s.starts_with("#"))
955            .collect();
956
957        let lines2: Vec<String> = content2
958            .lines()
959            .map(|s| s.trim().to_string())
960            .filter(|s| !s.is_empty() && !s.starts_with("//") && !s.starts_with("#"))
961            .collect();
962
963        if lines1.is_empty() || lines2.is_empty() {
964            return 0.0;
965        }
966
967        // Jaccard coefficient: intersection / union
968        let set1: HashSet<String> = lines1.into_iter().collect();
969        let set2: HashSet<String> = lines2.into_iter().collect();
970
971        if set1.is_empty() && set2.is_empty() {
972            return 1.0;
973        }
974
975        let intersection = set1.intersection(&set2).count();
976        let union = set1.union(&set2).count();
977
978        if union == 0 {
979            0.0
980        } else {
981            intersection as f64 / union as f64
982        }
983    }
984
985    /// Find duplicate code blocks within files
986    pub fn find_duplicate_blocks(
987        &self,
988        content: &str,
989        min_lines: usize,
990        similarity_threshold: f64,
991    ) -> Result<Vec<Value>> {
992        let mut duplicates = Vec::new();
993        let lines: Vec<&str> = content.lines().collect();
994
995        if lines.len() < min_lines * 2 {
996            return Ok(duplicates);
997        }
998
999        // Compare all possible blocks of minimum size
1000        for i in 0..=lines.len().saturating_sub(min_lines) {
1001            for j in (i + min_lines)..=lines.len().saturating_sub(min_lines) {
1002                let block1 = &lines[i..i + min_lines];
1003                let block2 = &lines[j..j + min_lines];
1004
1005                let block1_text = block1.join("\n");
1006                let block2_text = block2.join("\n");
1007
1008                let similarity = self.calculate_content_similarity(&block1_text, &block2_text);
1009
1010                if similarity >= similarity_threshold {
1011                    duplicates.push(serde_json::json!({
1012                        "similarity": similarity,
1013                        "blocks": [
1014                            {
1015                                "start_line": i + 1,
1016                                "end_line": i + min_lines,
1017                                "content": block1_text
1018                            },
1019                            {
1020                                "start_line": j + 1,
1021                                "end_line": j + min_lines,
1022                                "content": block2_text
1023                            }
1024                        ],
1025                        "type": "block_similarity"
1026                    }));
1027                }
1028            }
1029        }
1030
1031        Ok(duplicates)
1032    }
1033
1034    /// Calculate structural similarity (ignoring variable names)
1035    pub fn calculate_structural_similarity(&self, content1: &str, content2: &str) -> f64 {
1036        // Normalize content by removing variable names and keeping structure
1037        let normalized1 = self.normalize_for_structure(content1);
1038        let normalized2 = self.normalize_for_structure(content2);
1039
1040        self.calculate_content_similarity(&normalized1, &normalized2)
1041    }
1042
1043    /// Normalize content for structural comparison
1044    fn normalize_for_structure(&self, content: &str) -> String {
1045        content
1046            .lines()
1047            .map(|line| {
1048                let trimmed = line.trim();
1049                // Replace identifiers with normalized tokens while keeping structure
1050                let mut normalized = trimmed.to_string();
1051
1052                // Replace common patterns
1053                normalized = regex::Regex::new(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")
1054                    .unwrap()
1055                    .replace_all(&normalized, "IDENTIFIER")
1056                    .to_string();
1057
1058                normalized = regex::Regex::new(r"\b\d+\b")
1059                    .unwrap()
1060                    .replace_all(&normalized, "NUMBER")
1061                    .to_string();
1062
1063                normalized = regex::Regex::new(r#""[^"]*""#)
1064                    .unwrap()
1065                    .replace_all(&normalized, "STRING")
1066                    .to_string();
1067
1068                normalized
1069            })
1070            .collect::<Vec<_>>()
1071            .join("\n")
1072    }
1073}
1074
impl Default for DuplicateAnalyzer {
    /// Delegates to [`DuplicateAnalyzer::new`] so that `default()` and
    /// `new()` always construct identically-configured analyzers.
    fn default() -> Self {
        Self::new()
    }
}
1080
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Jaccard line similarity: partially overlapping inputs score strictly
    /// between 0 and 1; an input compared with itself scores exactly 1.0.
    #[test]
    fn test_content_similarity() {
        let analyzer = DuplicateAnalyzer::new();

        // Two of three lines shared -> similarity in (0, 1).
        let content1 = "line1\nline2\nline3";
        let content2 = "line1\nline2\nline4";

        let similarity = analyzer.calculate_content_similarity(content1, content2);
        assert!(similarity > 0.0 && similarity < 1.0);

        let identical = analyzer.calculate_content_similarity(content1, content1);
        assert_eq!(identical, 1.0);
    }

    /// Structural similarity ignores identifier names: two functions that
    /// differ only in names/parameters normalize to near-identical forms.
    #[test]
    fn test_structural_similarity() {
        let analyzer = DuplicateAnalyzer::new();

        let content1 = "def func1(x, y):\n    return x + y";
        let content2 = "def func2(a, b):\n    return a + b";

        let similarity = analyzer.calculate_structural_similarity(content1, content2);
        assert!(similarity > 0.8); // Should be very similar structurally
    }

    /// Block-level detection: a 3-line sequence repeated later in the same
    /// content should be reported as a duplicate block pair.
    #[test]
    fn test_find_duplicate_blocks() {
        let analyzer = DuplicateAnalyzer::new();

        // Lines 1-3 repeat at lines 5-7.
        let content = "line1\nline2\nline3\nline4\nline1\nline2\nline3\nline5";
        let duplicates = analyzer.find_duplicate_blocks(content, 3, 0.8).unwrap();

        assert!(!duplicates.is_empty(), "Should find duplicate code");
    }

    /// Cross-file detection: two files with identical contents in a temp
    /// directory should be reported as duplicates.
    #[test]
    fn test_find_code_duplicates() {
        let mut analyzer = DuplicateAnalyzer::new();
        let temp_dir = tempdir().unwrap();

        // Create test files
        let file1_path = temp_dir.path().join("file1.py");
        let file2_path = temp_dir.path().join("file2.py");

        fs::write(&file1_path, "def test():\n    return 1").unwrap();
        fs::write(&file2_path, "def test():\n    return 1").unwrap();

        let duplicates = analyzer
            .find_code_duplicates(temp_dir.path(), 0.8, 1, &[])
            .unwrap();
        assert!(!duplicates.is_empty(), "Should find duplicate code");
    }
}