Skip to main content

batuta/comply/rules/
duplication.rs

1//! Code Duplication Detection Rule
2//!
3//! Detects significant code duplication across PAIML stack projects using MinHash+LSH.
4
5use crate::comply::rule::{
6    FixResult, RuleCategory, RuleResult, RuleViolation, StackComplianceRule, Suggestion,
7    ViolationLevel,
8};
9use std::collections::{HashMap, HashSet};
10use std::path::Path;
11
/// Code duplication detection rule using MinHash+LSH.
///
/// Fragments are extracted per file, hashed into MinHash signatures, and
/// compared via banded locality-sensitive hashing; clusters at or above the
/// similarity threshold are reported as violations or suggestions.
#[derive(Debug)]
pub struct DuplicationRule {
    /// Similarity threshold (0.0 to 1.0); pairs at or above this count as duplicates
    similarity_threshold: f64,
    /// Minimum fragment size in lines; shorter spans are never reported
    min_fragment_size: usize,
    /// Number of MinHash permutations (length of each signature)
    num_permutations: usize,
    /// File patterns to include (glob-style, e.g. `**/*.rs`)
    include_patterns: Vec<String>,
    /// File patterns to exclude; checked before the include patterns
    exclude_patterns: Vec<String>,
}
26
impl Default for DuplicationRule {
    /// Delegates to [`DuplicationRule::new`] so the standard configuration
    /// is defined in exactly one place.
    fn default() -> Self {
        Self::new()
    }
}
32
impl DuplicationRule {
    /// Create a new duplication rule with default configuration:
    /// 85% similarity threshold, 50-line minimum fragments, 128 MinHash
    /// permutations, scanning `**/*.rs` outside target/tests/benches.
    pub fn new() -> Self {
        Self {
            similarity_threshold: 0.85,
            min_fragment_size: 50,
            num_permutations: 128,
            include_patterns: vec!["**/*.rs".to_string()],
            exclude_patterns: vec![
                "**/target/**".to_string(),
                "**/tests/**".to_string(),
                "**/benches/**".to_string(),
            ],
        }
    }

    /// Extract code fragments from a file for duplication analysis.
    ///
    /// Parses the file and extracts logical code blocks (functions, impls, structs)
    /// that meet the minimum fragment size requirement. Uses a sliding window
    /// approach with block depth tracking to identify boundaries.
    ///
    /// NOTE(review): brace counting is purely textual — braces inside string
    /// literals or comments skew `block_depth`; presumably acceptable for a
    /// heuristic scan, but confirm.
    ///
    /// # Arguments
    /// * `path` - Path to the source file to analyze
    ///
    /// # Returns
    /// * `Ok(Vec<CodeFragment>)` - Extracted fragments meeting size threshold
    ///   (line numbers are 1-based and inclusive)
    /// * `Err` - If file cannot be read
    fn extract_fragments(&self, path: &Path) -> anyhow::Result<Vec<CodeFragment>> {
        let content = std::fs::read_to_string(path)?;
        let lines: Vec<&str> = content.lines().collect();

        // A file shorter than one fragment can never yield a fragment.
        if lines.len() < self.min_fragment_size {
            return Ok(Vec::new());
        }

        let mut fragments = Vec::new();

        // Use sliding window to extract fragments
        // For efficiency, use function/impl boundaries when possible
        let mut current_start = 0;
        let mut in_block = false;
        let mut block_depth = 0;

        for (i, line) in lines.iter().enumerate() {
            let trimmed = line.trim();

            // Track block depth (saturating: stray '}' cannot underflow)
            block_depth += trimmed.matches('{').count();
            block_depth = block_depth.saturating_sub(trimmed.matches('}').count());

            // Detect function/impl boundaries; only top-level items start a
            // fragment (items nested inside an open block are skipped)
            if (trimmed.starts_with("fn ")
                || trimmed.starts_with("pub fn ")
                || trimmed.starts_with("impl ")
                || trimmed.starts_with("pub struct ")
                || trimmed.starts_with("struct "))
                && !in_block
            {
                // Start new fragment; flush the preceding span if it is
                // large enough on its own
                if i > current_start && i - current_start >= self.min_fragment_size {
                    fragments.push(CodeFragment {
                        path: path.to_path_buf(),
                        start_line: current_start + 1,
                        end_line: i,
                        content: lines[current_start..i].join("\n"),
                    });
                }
                current_start = i;
                in_block = true;
            }

            // End of block: depth returned to zero on a line closing a brace
            if in_block && block_depth == 0 && trimmed.ends_with('}') {
                if i - current_start >= self.min_fragment_size {
                    fragments.push(CodeFragment {
                        path: path.to_path_buf(),
                        start_line: current_start + 1,
                        end_line: i + 1,
                        content: lines[current_start..=i].join("\n"),
                    });
                }
                current_start = i + 1;
                in_block = false;
            }
        }

        // Capture remaining content after the last detected boundary
        if lines.len() - current_start >= self.min_fragment_size {
            fragments.push(CodeFragment {
                path: path.to_path_buf(),
                start_line: current_start + 1,
                end_line: lines.len(),
                content: lines[current_start..].join("\n"),
            });
        }

        Ok(fragments)
    }

    /// Compute MinHash signature for a code fragment.
    ///
    /// Uses locality-sensitive hashing to create a compact signature
    /// that can be compared for similarity. Fragments with similar
    /// content will have similar signatures.
    ///
    /// # Algorithm
    /// 1. Tokenize content into n-grams
    /// 2. Hash each token with multiple permutation functions
    /// 3. Keep minimum hash for each permutation (MinHash property)
    ///
    /// Cost is O(tokens × num_permutations) per fragment.
    ///
    /// # Arguments
    /// * `fragment` - Code fragment to compute signature for
    ///
    /// # Returns
    /// MinHash signature with `num_permutations` values
    fn compute_minhash(&self, fragment: &CodeFragment) -> MinHashSignature {
        // Tokenize: extract words and n-grams
        let tokens = self.tokenize(&fragment.content);

        // Compute MinHash using multiple hash functions; u64::MAX is the
        // identity for the running minimum
        let mut signature = vec![u64::MAX; self.num_permutations];

        for token in tokens {
            for (i, sig) in signature.iter_mut().enumerate() {
                // Simple hash combining token and permutation index
                let hash = self.hash_token(&token, i as u64);
                if hash < *sig {
                    *sig = hash;
                }
            }
        }

        MinHashSignature { values: signature }
    }

    /// Tokenize code into n-grams for MinHash computation.
    ///
    /// Normalizes code by removing comments and extra whitespace,
    /// then extracts both individual tokens and 3-grams (sequences
    /// of 3 words) to capture structural similarity.
    ///
    /// NOTE(review): only line comments starting with `//` after trim are
    /// stripped; block comments (`/* … */`) are kept — confirm intended.
    ///
    /// # Arguments
    /// * `content` - Raw code content to tokenize
    ///
    /// # Returns
    /// Vector of token strings (words and 3-grams)
    fn tokenize(&self, content: &str) -> Vec<String> {
        let mut tokens = Vec::new();

        // Normalize: lowercase, remove extra whitespace
        let normalized: String = content
            .lines()
            .map(|l| l.trim())
            .filter(|l| !l.is_empty() && !l.starts_with("//"))
            .collect::<Vec<_>>()
            .join(" ");

        // Extract words: runs of alphanumerics/underscores (identifiers,
        // keywords, numbers); punctuation acts as a separator
        let words: Vec<&str> = normalized
            .split(|c: char| !c.is_alphanumeric() && c != '_')
            .filter(|w| !w.is_empty())
            .collect();

        // Generate 3-grams to capture local word order / structure
        for window in words.windows(3) {
            tokens.push(window.join(" "));
        }

        // Also add individual significant tokens (> 3 chars filters out
        // short keywords like "fn", "let", "if")
        for word in &words {
            if word.len() > 3 {
                tokens.push((*word).to_string());
            }
        }

        tokens
    }

    /// Hash a token with a permutation index.
    ///
    /// Uses FNV-1a hash combined with the permutation index to create
    /// different hash functions for MinHash. This simulates independent
    /// hash functions required by the MinHash algorithm.
    ///
    /// # Arguments
    /// * `token` - Token string to hash
    /// * `perm` - Permutation index (0 to num_permutations-1)
    ///
    /// # Returns
    /// 64-bit hash value
    fn hash_token(&self, token: &str, perm: u64) -> u64 {
        // FNV-1a hash with permutation mixing: the FNV offset basis is
        // pre-mixed with the permutation index so each index behaves as a
        // distinct hash function over the same token bytes
        let mut hash: u64 = 0xcbf29ce484222325;
        hash = hash.wrapping_mul(0x100000001b3);
        hash ^= perm;

        for byte in token.bytes() {
            hash ^= byte as u64;
            hash = hash.wrapping_mul(0x100000001b3);
        }

        hash
    }

    /// Estimate Jaccard similarity as the fraction of positions at which
    /// the two MinHash signatures agree (an unbiased estimator of the
    /// token-set overlap). Both signatures must have `num_permutations`
    /// values, as produced by `compute_minhash`.
    fn jaccard_similarity(&self, sig1: &MinHashSignature, sig2: &MinHashSignature) -> f64 {
        let matches = sig1.values.iter().zip(sig2.values.iter()).filter(|(a, b)| a == b).count();

        matches as f64 / self.num_permutations as f64
    }

    /// Find clusters of similar fragments using banded LSH.
    ///
    /// Signatures are split into bands; fragments whose band values are
    /// identical land in the same bucket and become candidate pairs.
    /// Candidates are verified against the full-signature similarity, and
    /// verified pairs are clustered with union-find.
    ///
    /// NOTE(review): with 128 permutations and 20 bands, `rows_per_band`
    /// is 6, so the last 8 signature values never participate in banding
    /// (they still count in the similarity estimate) — confirm intended.
    fn find_duplicates(&self, fragments: &[CodeFragment]) -> Vec<DuplicateCluster> {
        if fragments.len() < 2 {
            return Vec::new();
        }

        // Compute signatures for all fragments
        let signatures: Vec<MinHashSignature> =
            fragments.iter().map(|f| self.compute_minhash(f)).collect();

        // Use LSH to find candidate pairs
        let mut similar_pairs: Vec<(usize, usize, f64)> = Vec::new();

        // Band-based LSH for faster candidate generation: only fragments
        // sharing at least one identical band are compared in full
        let bands = 20;
        let rows_per_band = self.num_permutations / bands;
        let mut buckets: HashMap<(usize, Vec<u64>), Vec<usize>> = HashMap::new();

        for (idx, sig) in signatures.iter().enumerate() {
            for band in 0..bands {
                let start = band * rows_per_band;
                let end = start + rows_per_band;
                let band_hash: Vec<u64> = sig.values[start..end].to_vec();
                buckets.entry((band, band_hash)).or_default().push(idx);
            }
        }

        // Check candidates; `checked` de-duplicates pairs that collide in
        // more than one band
        let mut checked: HashSet<(usize, usize)> = HashSet::new();
        for indices in buckets.values() {
            for i in 0..indices.len() {
                for j in (i + 1)..indices.len() {
                    let idx1 = indices[i];
                    let idx2 = indices[j];
                    let key = (idx1.min(idx2), idx1.max(idx2));

                    if checked.contains(&key) {
                        continue;
                    }
                    checked.insert(key);

                    let sim = self.jaccard_similarity(&signatures[idx1], &signatures[idx2]);
                    if sim >= self.similarity_threshold {
                        similar_pairs.push((idx1, idx2, sim));
                    }
                }
            }
        }

        // Cluster similar fragments using union-find
        let mut clusters = self.cluster_fragments(fragments, &similar_pairs);

        // Filter to only significant clusters, most similar first
        clusters.retain(|c| c.fragments.len() >= 2);
        clusters.sort_by(|a, b| b.similarity.total_cmp(&a.similarity));

        clusters
    }

    /// Cluster fragments using union-find over the verified similar pairs.
    ///
    /// Each connected component of the pair graph becomes one cluster; a
    /// cluster's `similarity` is the maximum pairwise similarity observed
    /// among its members.
    fn cluster_fragments(
        &self,
        fragments: &[CodeFragment],
        pairs: &[(usize, usize, f64)],
    ) -> Vec<DuplicateCluster> {
        let n = fragments.len();
        let mut parent: Vec<usize> = (0..n).collect();
        let mut rank: Vec<usize> = vec![0; n];

        // Find with path compression
        fn find(parent: &mut [usize], x: usize) -> usize {
            if parent[x] != x {
                parent[x] = find(parent, parent[x]);
            }
            parent[x]
        }

        // Union by rank
        fn union(parent: &mut [usize], rank: &mut [usize], x: usize, y: usize) {
            let px = find(parent, x);
            let py = find(parent, y);
            if px == py {
                return;
            }
            match rank[px].cmp(&rank[py]) {
                std::cmp::Ordering::Less => parent[px] = py,
                std::cmp::Ordering::Greater => parent[py] = px,
                std::cmp::Ordering::Equal => {
                    parent[py] = px;
                    rank[px] += 1;
                }
            }
        }

        // Union similar pairs
        for (i, j, _sim) in pairs {
            union(&mut parent, &mut rank, *i, *j);
        }

        // Group by cluster root; members may be pushed more than once and
        // are de-duplicated when building the cluster below
        let mut cluster_map: HashMap<usize, Vec<(usize, f64)>> = HashMap::new();
        for (i, j, sim) in pairs {
            let root = find(&mut parent, *i);
            cluster_map.entry(root).or_default().push((*i, *sim));
            cluster_map.entry(root).or_default().push((*j, *sim));
        }

        // Build clusters
        let mut clusters = Vec::new();
        for (_root, members) in cluster_map {
            let mut seen = HashSet::new();
            let mut cluster_fragments = Vec::new();
            let mut max_sim = 0.0f64;

            for (idx, sim) in members {
                if seen.insert(idx) {
                    cluster_fragments.push(fragments[idx].clone());
                    max_sim = max_sim.max(sim);
                }
            }

            if cluster_fragments.len() >= 2 {
                clusters
                    .push(DuplicateCluster { fragments: cluster_fragments, similarity: max_sim });
            }
        }

        clusters
    }

    /// Check if a file matches include/exclude patterns.
    ///
    /// Exclusions win: a path matching any exclude pattern is rejected
    /// before the include patterns are consulted. A path matching neither
    /// list is excluded by default.
    fn should_include(&self, path: &Path) -> bool {
        let path_str = path.to_string_lossy();

        // Check exclude patterns first
        for pattern in &self.exclude_patterns {
            if glob_match(pattern, &path_str) {
                return false;
            }
        }

        // Check include patterns
        for pattern in &self.include_patterns {
            if glob_match(pattern, &path_str) {
                return true;
            }
        }

        false
    }
}
394
395/// Simple glob matching (supports ** and *)
396fn glob_match(pattern: &str, path: &str) -> bool {
397    let pattern_parts: Vec<&str> = pattern.split('/').collect();
398    let path_parts: Vec<&str> = path.split('/').collect();
399
400    glob_match_parts(&pattern_parts, &path_parts)
401}
402
403/// Handle ** glob pattern matching (recursive)
404fn glob_match_doublestar(pattern: &[&str], path: &[&str]) -> bool {
405    let pattern_rest = pattern.get(1..).unwrap_or(&[]);
406    let path_rest = path.get(1..).unwrap_or(&[]);
407    // ** matches zero directories: try rest of pattern with current path
408    if glob_match_parts(pattern_rest, path) {
409        return true;
410    }
411    // ** matches one+ directories: try same pattern with rest of path
412    if !path.is_empty() && glob_match_parts(pattern, path_rest) {
413        return true;
414    }
415    // ** matches current and move both forward
416    !path.is_empty() && glob_match_parts(pattern_rest, path_rest)
417}
418
419fn glob_match_parts(pattern: &[&str], path: &[&str]) -> bool {
420    if pattern.is_empty() {
421        return path.is_empty();
422    }
423
424    if path.is_empty() {
425        return pattern.iter().all(|p| *p == "**");
426    }
427
428    let pat_first = match pattern.first() {
429        Some(p) => *p,
430        None => return path.is_empty(),
431    };
432
433    if pat_first == "**" {
434        return glob_match_doublestar(pattern, path);
435    }
436
437    let path_first = match path.first() {
438        Some(p) => *p,
439        None => return false,
440    };
441    let pattern_rest = pattern.get(1..).unwrap_or(&[]);
442    let path_rest = path.get(1..).unwrap_or(&[]);
443
444    // Regular segment: must match and continue
445    segment_match(pat_first, path_first) && glob_match_parts(pattern_rest, path_rest)
446}
447
/// Match a single glob pattern segment against a single path segment.
///
/// Supports any number of `*` wildcards within the segment; each `*`
/// matches zero or more characters (never a `/`, since matching is
/// per-segment). Literal text between wildcards must appear in order.
///
/// This fixes two defects of the previous implementation: patterns with
/// more than one `*` (e.g. `a*b*c`) fell back to exact string equality,
/// and the independent prefix/suffix checks could overlap (e.g. `a*a`
/// wrongly matched `"a"`).
///
/// # Arguments
/// * `pattern` - One glob segment (no `/`), e.g. `"*.rs"`
/// * `segment` - One path segment, e.g. `"duplication.rs"`
///
/// # Returns
/// `true` if the segment satisfies the pattern.
fn segment_match(pattern: &str, segment: &str) -> bool {
    // Fast paths: bare `*` matches anything; no wildcard means exact match.
    if pattern == "*" {
        return true;
    }
    if !pattern.contains('*') {
        return pattern == segment;
    }

    // General case: split on `*`. The first literal anchors the start, the
    // last anchors the end, and middle literals must appear in order
    // (greedy left-to-right scan) in the non-overlapping remainder.
    let parts: Vec<&str> = pattern.split('*').collect();
    let last_idx = parts.len() - 1;

    let mut remaining = match segment.strip_prefix(parts[0]) {
        Some(rest) => rest,
        None => return false,
    };

    for part in &parts[1..last_idx] {
        if part.is_empty() {
            // Consecutive `*`s collapse into a single wildcard.
            continue;
        }
        match remaining.find(part) {
            Some(pos) => remaining = &remaining[pos + part.len()..],
            None => return false,
        }
    }

    // The final literal must terminate what has not yet been consumed,
    // so it cannot overlap the prefix or the middle matches.
    remaining.ends_with(parts[last_idx])
}
465
/// A contiguous span of source lines extracted for duplication analysis.
#[derive(Debug, Clone)]
struct CodeFragment {
    // Source file the fragment came from
    path: std::path::PathBuf,
    // 1-based first line of the fragment
    start_line: usize,
    // 1-based last line of the fragment (inclusive)
    end_line: usize,
    // The fragment's raw text, lines joined with '\n'
    content: String,
}
474
/// MinHash signature: one minimum hash value per permutation.
///
/// The fraction of positions at which two signatures agree estimates the
/// Jaccard similarity of the underlying token sets (see
/// `DuplicationRule::jaccard_similarity`).
#[derive(Debug)]
struct MinHashSignature {
    // `num_permutations` minimum hash values
    values: Vec<u64>,
}
480
/// A cluster of mutually similar code fragments.
#[derive(Debug)]
struct DuplicateCluster {
    // Fragments grouped together by union-find over similar pairs
    fragments: Vec<CodeFragment>,
    // Highest pairwise similarity observed within the cluster
    similarity: f64,
}
487
488impl StackComplianceRule for DuplicationRule {
489    fn id(&self) -> &'static str {
490        "code-duplication"
491    }
492
493    fn description(&self) -> &'static str {
494        "Detects significant code duplication using MinHash+LSH"
495    }
496
497    fn help(&self) -> Option<&str> {
498        Some(
499            "Threshold: 85% similarity, Minimum: 50 lines\n\
500             Uses MinHash+LSH for efficient near-duplicate detection",
501        )
502    }
503
504    fn category(&self) -> RuleCategory {
505        RuleCategory::Code
506    }
507
508    fn check(&self, project_path: &Path) -> anyhow::Result<RuleResult> {
509        // Collect all source files
510        let mut fragments = Vec::new();
511
512        for entry in walkdir::WalkDir::new(project_path).into_iter().filter_map(|e| e.ok()) {
513            let path = entry.path();
514            if path.is_file() && self.should_include(path) {
515                match self.extract_fragments(path) {
516                    Ok(frags) => fragments.extend(frags),
517                    Err(_) => continue, // Skip files that can't be read
518                }
519            }
520        }
521
522        // Find duplicates
523        let clusters = self.find_duplicates(&fragments);
524
525        if clusters.is_empty() {
526            return Ok(RuleResult::pass());
527        }
528
529        let mut violations = Vec::new();
530        let mut suggestions = Vec::new();
531
532        for (i, cluster) in clusters.iter().enumerate() {
533            // Only report violations for very high similarity (likely copy-paste)
534            if cluster.similarity >= 0.95 {
535                let locations: Vec<String> = cluster
536                    .fragments
537                    .iter()
538                    .map(|f| format!("{}:{}-{}", f.path.display(), f.start_line, f.end_line))
539                    .collect();
540
541                violations.push(
542                    RuleViolation::new(
543                        format!("DUP-{:03}", i + 1),
544                        format!(
545                            "High code duplication ({:.0}%) across {} locations",
546                            cluster.similarity * 100.0,
547                            cluster.fragments.len()
548                        ),
549                    )
550                    .with_severity(ViolationLevel::Warning)
551                    .with_location(locations.join(", ")),
552                );
553            } else {
554                // Lower similarity is a suggestion
555                let locations: Vec<String> = cluster
556                    .fragments
557                    .iter()
558                    .take(3)
559                    .map(|f| format!("{}:{}", f.path.display(), f.start_line))
560                    .collect();
561
562                suggestions.push(
563                    Suggestion::new(format!(
564                        "Similar code ({:.0}%) found in {} locations: {}",
565                        cluster.similarity * 100.0,
566                        cluster.fragments.len(),
567                        locations.join(", ")
568                    ))
569                    .with_fix("Consider extracting to a shared module".to_string()),
570                );
571            }
572        }
573
574        if violations.is_empty() {
575            Ok(RuleResult::pass_with_suggestions(suggestions))
576        } else {
577            let mut result = RuleResult::fail(violations);
578            result.suggestions = suggestions;
579            Ok(result)
580        }
581    }
582
583    fn can_fix(&self) -> bool {
584        false // Duplication requires manual refactoring
585    }
586
587    fn fix(&self, _project_path: &Path) -> anyhow::Result<FixResult> {
588        Ok(FixResult::failure(
589            "Auto-fix not supported for code duplication - manual refactoring required",
590        ))
591    }
592}
593
594#[cfg(test)]
595#[path = "duplication_tests.rs"]
596mod tests;