1use crate::comply::rule::{
6 FixResult, RuleCategory, RuleResult, RuleViolation, StackComplianceRule, Suggestion,
7 ViolationLevel,
8};
9use std::collections::{HashMap, HashSet};
10use std::path::Path;
11
/// Compliance rule that flags near-duplicate code across a project using
/// MinHash signatures with locality-sensitive hashing (LSH) banding.
#[derive(Debug)]
pub struct DuplicationRule {
    /// Minimum estimated Jaccard similarity (0.0..=1.0) for two fragments to be reported.
    similarity_threshold: f64,
    /// Smallest fragment size, in lines, that is considered for comparison.
    min_fragment_size: usize,
    /// Number of hash permutations in each MinHash signature.
    num_permutations: usize,
    /// Glob patterns selecting which files to scan (e.g. "**/*.rs").
    include_patterns: Vec<String>,
    /// Glob patterns for files to skip; excludes are checked before includes.
    exclude_patterns: Vec<String>,
}
26
27impl Default for DuplicationRule {
28 fn default() -> Self {
29 Self::new()
30 }
31}
32
33impl DuplicationRule {
34 pub fn new() -> Self {
36 Self {
37 similarity_threshold: 0.85,
38 min_fragment_size: 50,
39 num_permutations: 128,
40 include_patterns: vec!["**/*.rs".to_string()],
41 exclude_patterns: vec![
42 "**/target/**".to_string(),
43 "**/tests/**".to_string(),
44 "**/benches/**".to_string(),
45 ],
46 }
47 }
48
49 fn extract_fragments(&self, path: &Path) -> anyhow::Result<Vec<CodeFragment>> {
62 let content = std::fs::read_to_string(path)?;
63 let lines: Vec<&str> = content.lines().collect();
64
65 if lines.len() < self.min_fragment_size {
66 return Ok(Vec::new());
67 }
68
69 let mut fragments = Vec::new();
70
71 let mut current_start = 0;
74 let mut in_block = false;
75 let mut block_depth = 0;
76
77 for (i, line) in lines.iter().enumerate() {
78 let trimmed = line.trim();
79
80 block_depth += trimmed.matches('{').count();
82 block_depth = block_depth.saturating_sub(trimmed.matches('}').count());
83
84 if (trimmed.starts_with("fn ")
86 || trimmed.starts_with("pub fn ")
87 || trimmed.starts_with("impl ")
88 || trimmed.starts_with("pub struct ")
89 || trimmed.starts_with("struct "))
90 && !in_block
91 {
92 if i > current_start && i - current_start >= self.min_fragment_size {
94 fragments.push(CodeFragment {
95 path: path.to_path_buf(),
96 start_line: current_start + 1,
97 end_line: i,
98 content: lines[current_start..i].join("\n"),
99 });
100 }
101 current_start = i;
102 in_block = true;
103 }
104
105 if in_block && block_depth == 0 && trimmed.ends_with('}') {
107 if i - current_start >= self.min_fragment_size {
108 fragments.push(CodeFragment {
109 path: path.to_path_buf(),
110 start_line: current_start + 1,
111 end_line: i + 1,
112 content: lines[current_start..=i].join("\n"),
113 });
114 }
115 current_start = i + 1;
116 in_block = false;
117 }
118 }
119
120 if lines.len() - current_start >= self.min_fragment_size {
122 fragments.push(CodeFragment {
123 path: path.to_path_buf(),
124 start_line: current_start + 1,
125 end_line: lines.len(),
126 content: lines[current_start..].join("\n"),
127 });
128 }
129
130 Ok(fragments)
131 }
132
133 fn compute_minhash(&self, fragment: &CodeFragment) -> MinHashSignature {
150 let tokens = self.tokenize(&fragment.content);
152
153 let mut signature = vec![u64::MAX; self.num_permutations];
155
156 for token in tokens {
157 for (i, sig) in signature.iter_mut().enumerate() {
158 let hash = self.hash_token(&token, i as u64);
160 if hash < *sig {
161 *sig = hash;
162 }
163 }
164 }
165
166 MinHashSignature { values: signature }
167 }
168
169 fn tokenize(&self, content: &str) -> Vec<String> {
181 let mut tokens = Vec::new();
182
183 let normalized: String = content
185 .lines()
186 .map(|l| l.trim())
187 .filter(|l| !l.is_empty() && !l.starts_with("//"))
188 .collect::<Vec<_>>()
189 .join(" ");
190
191 let words: Vec<&str> = normalized
193 .split(|c: char| !c.is_alphanumeric() && c != '_')
194 .filter(|w| !w.is_empty())
195 .collect();
196
197 for window in words.windows(3) {
199 tokens.push(window.join(" "));
200 }
201
202 for word in &words {
204 if word.len() > 3 {
205 tokens.push((*word).to_string());
206 }
207 }
208
209 tokens
210 }
211
212 fn hash_token(&self, token: &str, perm: u64) -> u64 {
225 let mut hash: u64 = 0xcbf29ce484222325;
227 hash = hash.wrapping_mul(0x100000001b3);
228 hash ^= perm;
229
230 for byte in token.bytes() {
231 hash ^= byte as u64;
232 hash = hash.wrapping_mul(0x100000001b3);
233 }
234
235 hash
236 }
237
238 fn jaccard_similarity(&self, sig1: &MinHashSignature, sig2: &MinHashSignature) -> f64 {
240 let matches = sig1.values.iter().zip(sig2.values.iter()).filter(|(a, b)| a == b).count();
241
242 matches as f64 / self.num_permutations as f64
243 }
244
245 fn find_duplicates(&self, fragments: &[CodeFragment]) -> Vec<DuplicateCluster> {
247 if fragments.len() < 2 {
248 return Vec::new();
249 }
250
251 let signatures: Vec<MinHashSignature> =
253 fragments.iter().map(|f| self.compute_minhash(f)).collect();
254
255 let mut similar_pairs: Vec<(usize, usize, f64)> = Vec::new();
257
258 let bands = 20;
260 let rows_per_band = self.num_permutations / bands;
261 let mut buckets: HashMap<(usize, Vec<u64>), Vec<usize>> = HashMap::new();
262
263 for (idx, sig) in signatures.iter().enumerate() {
264 for band in 0..bands {
265 let start = band * rows_per_band;
266 let end = start + rows_per_band;
267 let band_hash: Vec<u64> = sig.values[start..end].to_vec();
268 buckets.entry((band, band_hash)).or_default().push(idx);
269 }
270 }
271
272 let mut checked: HashSet<(usize, usize)> = HashSet::new();
274 for indices in buckets.values() {
275 for i in 0..indices.len() {
276 for j in (i + 1)..indices.len() {
277 let idx1 = indices[i];
278 let idx2 = indices[j];
279 let key = (idx1.min(idx2), idx1.max(idx2));
280
281 if checked.contains(&key) {
282 continue;
283 }
284 checked.insert(key);
285
286 let sim = self.jaccard_similarity(&signatures[idx1], &signatures[idx2]);
287 if sim >= self.similarity_threshold {
288 similar_pairs.push((idx1, idx2, sim));
289 }
290 }
291 }
292 }
293
294 let mut clusters = self.cluster_fragments(fragments, &similar_pairs);
296
297 clusters.retain(|c| c.fragments.len() >= 2);
299 clusters.sort_by(|a, b| b.similarity.total_cmp(&a.similarity));
300
301 clusters
302 }
303
304 fn cluster_fragments(
306 &self,
307 fragments: &[CodeFragment],
308 pairs: &[(usize, usize, f64)],
309 ) -> Vec<DuplicateCluster> {
310 let n = fragments.len();
311 let mut parent: Vec<usize> = (0..n).collect();
312 let mut rank: Vec<usize> = vec![0; n];
313
314 fn find(parent: &mut [usize], x: usize) -> usize {
315 if parent[x] != x {
316 parent[x] = find(parent, parent[x]);
317 }
318 parent[x]
319 }
320
321 fn union(parent: &mut [usize], rank: &mut [usize], x: usize, y: usize) {
322 let px = find(parent, x);
323 let py = find(parent, y);
324 if px == py {
325 return;
326 }
327 match rank[px].cmp(&rank[py]) {
328 std::cmp::Ordering::Less => parent[px] = py,
329 std::cmp::Ordering::Greater => parent[py] = px,
330 std::cmp::Ordering::Equal => {
331 parent[py] = px;
332 rank[px] += 1;
333 }
334 }
335 }
336
337 for (i, j, _sim) in pairs {
339 union(&mut parent, &mut rank, *i, *j);
340 }
341
342 let mut cluster_map: HashMap<usize, Vec<(usize, f64)>> = HashMap::new();
344 for (i, j, sim) in pairs {
345 let root = find(&mut parent, *i);
346 cluster_map.entry(root).or_default().push((*i, *sim));
347 cluster_map.entry(root).or_default().push((*j, *sim));
348 }
349
350 let mut clusters = Vec::new();
352 for (_root, members) in cluster_map {
353 let mut seen = HashSet::new();
354 let mut cluster_fragments = Vec::new();
355 let mut max_sim = 0.0f64;
356
357 for (idx, sim) in members {
358 if seen.insert(idx) {
359 cluster_fragments.push(fragments[idx].clone());
360 max_sim = max_sim.max(sim);
361 }
362 }
363
364 if cluster_fragments.len() >= 2 {
365 clusters
366 .push(DuplicateCluster { fragments: cluster_fragments, similarity: max_sim });
367 }
368 }
369
370 clusters
371 }
372
373 fn should_include(&self, path: &Path) -> bool {
375 let path_str = path.to_string_lossy();
376
377 for pattern in &self.exclude_patterns {
379 if glob_match(pattern, &path_str) {
380 return false;
381 }
382 }
383
384 for pattern in &self.include_patterns {
386 if glob_match(pattern, &path_str) {
387 return true;
388 }
389 }
390
391 false
392 }
393}
394
395fn glob_match(pattern: &str, path: &str) -> bool {
397 let pattern_parts: Vec<&str> = pattern.split('/').collect();
398 let path_parts: Vec<&str> = path.split('/').collect();
399
400 glob_match_parts(&pattern_parts, &path_parts)
401}
402
403fn glob_match_doublestar(pattern: &[&str], path: &[&str]) -> bool {
405 let pattern_rest = pattern.get(1..).unwrap_or(&[]);
406 let path_rest = path.get(1..).unwrap_or(&[]);
407 if glob_match_parts(pattern_rest, path) {
409 return true;
410 }
411 if !path.is_empty() && glob_match_parts(pattern, path_rest) {
413 return true;
414 }
415 !path.is_empty() && glob_match_parts(pattern_rest, path_rest)
417}
418
419fn glob_match_parts(pattern: &[&str], path: &[&str]) -> bool {
420 if pattern.is_empty() {
421 return path.is_empty();
422 }
423
424 if path.is_empty() {
425 return pattern.iter().all(|p| *p == "**");
426 }
427
428 let pat_first = match pattern.first() {
429 Some(p) => *p,
430 None => return path.is_empty(),
431 };
432
433 if pat_first == "**" {
434 return glob_match_doublestar(pattern, path);
435 }
436
437 let path_first = match path.first() {
438 Some(p) => *p,
439 None => return false,
440 };
441 let pattern_rest = pattern.get(1..).unwrap_or(&[]);
442 let path_rest = path.get(1..).unwrap_or(&[]);
443
444 segment_match(pat_first, path_first) && glob_match_parts(pattern_rest, path_rest)
446}
447
/// Matches a single glob path segment against a literal segment.
///
/// `*` alone matches anything; otherwise each `*` matches any (possibly
/// empty) run of characters, and a pattern without `*` must match exactly.
///
/// Fixes two defects of the previous implementation: patterns with more
/// than one `*` (e.g. `a*b*c`) fell back to literal comparison and never
/// matched, and the `starts_with && ends_with` test let prefix and suffix
/// overlap, so `a*a` wrongly matched `"a"`.
fn segment_match(pattern: &str, segment: &str) -> bool {
    if pattern == "*" {
        return true;
    }
    if !pattern.contains('*') {
        return pattern == segment;
    }

    // The pattern contains at least one '*', so split yields >= 2 parts.
    let parts: Vec<&str> = pattern.split('*').collect();
    let (first, tail) = parts.split_first().expect("split always yields one part");
    let (last, middles) = tail.split_last().expect("pattern contains '*', tail non-empty");

    // The leading literal must be consumed as a true prefix.
    let mut remaining = match segment.strip_prefix(first) {
        Some(rest) => rest,
        None => return false,
    };

    // Greedy leftmost placement of each middle literal is complete: taking
    // the earliest occurrence always leaves the longest possible tail.
    for mid in middles {
        if mid.is_empty() {
            continue;
        }
        match remaining.find(mid) {
            Some(pos) => remaining = &remaining[pos + mid.len()..],
            None => return false,
        }
    }

    // The trailing literal must fit inside what is left after the middles.
    remaining.ends_with(last)
}
465
/// A contiguous slice of a source file that is compared for duplication.
#[derive(Debug, Clone)]
struct CodeFragment {
    /// File the fragment was extracted from.
    path: std::path::PathBuf,
    /// 1-based first line of the fragment (inclusive).
    start_line: usize,
    /// 1-based last line of the fragment (inclusive).
    end_line: usize,
    /// The fragment's raw text, with lines joined by '\n'.
    content: String,
}
474
/// MinHash signature of a fragment: one minimum hash value per permutation.
#[derive(Debug)]
struct MinHashSignature {
    /// One entry per permutation; the fraction of position-wise matches
    /// between two signatures estimates their Jaccard similarity.
    values: Vec<u64>,
}
480
/// A group of two or more fragments detected as near-duplicates of each other.
#[derive(Debug)]
struct DuplicateCluster {
    /// Member fragments of the cluster.
    fragments: Vec<CodeFragment>,
    /// Highest pairwise similarity observed within the cluster.
    similarity: f64,
}
487
488impl StackComplianceRule for DuplicationRule {
489 fn id(&self) -> &'static str {
490 "code-duplication"
491 }
492
493 fn description(&self) -> &'static str {
494 "Detects significant code duplication using MinHash+LSH"
495 }
496
497 fn help(&self) -> Option<&str> {
498 Some(
499 "Threshold: 85% similarity, Minimum: 50 lines\n\
500 Uses MinHash+LSH for efficient near-duplicate detection",
501 )
502 }
503
504 fn category(&self) -> RuleCategory {
505 RuleCategory::Code
506 }
507
508 fn check(&self, project_path: &Path) -> anyhow::Result<RuleResult> {
509 let mut fragments = Vec::new();
511
512 for entry in walkdir::WalkDir::new(project_path).into_iter().filter_map(|e| e.ok()) {
513 let path = entry.path();
514 if path.is_file() && self.should_include(path) {
515 match self.extract_fragments(path) {
516 Ok(frags) => fragments.extend(frags),
517 Err(_) => continue, }
519 }
520 }
521
522 let clusters = self.find_duplicates(&fragments);
524
525 if clusters.is_empty() {
526 return Ok(RuleResult::pass());
527 }
528
529 let mut violations = Vec::new();
530 let mut suggestions = Vec::new();
531
532 for (i, cluster) in clusters.iter().enumerate() {
533 if cluster.similarity >= 0.95 {
535 let locations: Vec<String> = cluster
536 .fragments
537 .iter()
538 .map(|f| format!("{}:{}-{}", f.path.display(), f.start_line, f.end_line))
539 .collect();
540
541 violations.push(
542 RuleViolation::new(
543 format!("DUP-{:03}", i + 1),
544 format!(
545 "High code duplication ({:.0}%) across {} locations",
546 cluster.similarity * 100.0,
547 cluster.fragments.len()
548 ),
549 )
550 .with_severity(ViolationLevel::Warning)
551 .with_location(locations.join(", ")),
552 );
553 } else {
554 let locations: Vec<String> = cluster
556 .fragments
557 .iter()
558 .take(3)
559 .map(|f| format!("{}:{}", f.path.display(), f.start_line))
560 .collect();
561
562 suggestions.push(
563 Suggestion::new(format!(
564 "Similar code ({:.0}%) found in {} locations: {}",
565 cluster.similarity * 100.0,
566 cluster.fragments.len(),
567 locations.join(", ")
568 ))
569 .with_fix("Consider extracting to a shared module".to_string()),
570 );
571 }
572 }
573
574 if violations.is_empty() {
575 Ok(RuleResult::pass_with_suggestions(suggestions))
576 } else {
577 let mut result = RuleResult::fail(violations);
578 result.suggestions = suggestions;
579 Ok(result)
580 }
581 }
582
583 fn can_fix(&self) -> bool {
584 false }
586
587 fn fix(&self, _project_path: &Path) -> anyhow::Result<FixResult> {
588 Ok(FixResult::failure(
589 "Auto-fix not supported for code duplication - manual refactoring required",
590 ))
591 }
592}
593
594#[cfg(test)]
595#[path = "duplication_tests.rs"]
596mod tests;