1use crate::model::{
2 BucketKey, CloneCluster, CloneConfig, CloneMatch, CloneType, Confidence, Edge, EdgeKind,
3 Language, StructuralFingerprint, SymbolNode,
4};
5use std::collections::{HashMap, HashSet};
6
7fn edge_kind_index(kind: &EdgeKind) -> u32 {
13 match kind {
14 EdgeKind::Contains => 0,
15 EdgeKind::ChildOf => 1,
16 EdgeKind::Calls => 2,
17 EdgeKind::ImportsFrom => 3,
18 EdgeKind::Extends => 4,
19 EdgeKind::Implements => 5,
20 EdgeKind::TestedBy => 6,
21 EdgeKind::DependsOn => 7,
22 EdgeKind::BarrelReExportAll => 8,
23 EdgeKind::ConditionalImport => 9,
24 EdgeKind::SideEffectImport => 10,
25 EdgeKind::DotImport => 11,
26 EdgeKind::HasDecorator => 12,
27 EdgeKind::Embeds => 13,
28 EdgeKind::TypeReference => 14,
29 EdgeKind::ReExport => 15,
30 }
31}
32
33pub fn compute_fingerprints(
39 symbols: &[SymbolNode],
40 edges: &[Edge],
41 config: &CloneConfig,
42) -> Vec<StructuralFingerprint> {
43 let mut outgoing: HashMap<&str, Vec<&Edge>> = HashMap::new();
45 let mut incoming: HashMap<&str, Vec<&Edge>> = HashMap::new();
47
48 for edge in edges {
49 outgoing.entry(edge.source.as_str()).or_default().push(edge);
50 incoming.entry(edge.target.as_str()).or_default().push(edge);
51 }
52
53 let mut fingerprints = Vec::with_capacity(symbols.len());
54
55 for sym in symbols {
56 let body_line_count = sym
57 .location
58 .line_end
59 .saturating_sub(sym.location.line_start)
60 + 1;
61
62 if body_line_count < config.min_lines {
63 continue;
64 }
65
66 let empty = Vec::new();
67 let out_edges = outgoing.get(sym.qualified_name.as_str()).unwrap_or(&empty);
68 let in_edges = incoming.get(sym.qualified_name.as_str()).unwrap_or(&empty);
69
70 let callee_count = out_edges
72 .iter()
73 .filter(|e| e.kind.confidence() >= Confidence::High)
74 .count();
75
76 let caller_count = in_edges
78 .iter()
79 .filter(|e| e.kind.confidence() >= Confidence::High)
80 .count();
81
82 let child_count = out_edges
84 .iter()
85 .filter(|e| e.kind == EdgeKind::Contains)
86 .count();
87
88 let mut edge_kind_set: u32 = 0;
90 for e in out_edges.iter() {
91 edge_kind_set |= 1u32 << edge_kind_index(&e.kind);
92 }
93
94 let language = Language::from_path(&sym.location.file);
95
96 fingerprints.push(StructuralFingerprint {
97 qualified_name: sym.qualified_name.clone(),
98 symbol_kind: sym.kind,
99 callee_count,
100 caller_count,
101 edge_kind_set,
102 body_line_count,
103 child_count,
104 language,
105 file: sym.location.file.clone(),
106 line_start: sym.location.line_start,
107 line_end: sym.location.line_end,
108 });
109 }
110
111 fingerprints
112}
113
114pub fn tokenize(source: &str) -> Vec<String> {
120 let mut tokens = Vec::new();
121
122 for line in source.lines() {
123 let stripped = if let Some(pos) = line.find("//") {
125 &line[..pos]
126 } else if let Some(pos) = line.find('#') {
127 &line[..pos]
128 } else {
129 line
130 };
131
132 for word in stripped.split_whitespace() {
133 split_on_punctuation(word, &mut tokens);
135 }
136 }
137
138 tokens
139}
140
141fn split_on_punctuation(word: &str, out: &mut Vec<String>) {
143 let mut current = String::new();
144
145 for ch in word.chars() {
146 if ch.is_ascii_punctuation() {
147 if !current.is_empty() {
149 out.push(current.clone());
150 current.clear();
151 }
152 out.push(ch.to_string());
154 } else {
155 current.push(ch);
156 }
157 }
158
159 if !current.is_empty() {
160 out.push(current);
161 }
162}
163
164pub fn normalize_identifiers(tokens: &[String]) -> Vec<String> {
168 let keywords: HashSet<&str> = [
169 "fn",
171 "let",
172 "mut",
173 "const",
174 "struct",
175 "enum",
176 "impl",
177 "trait",
178 "pub",
179 "use",
180 "mod",
181 "if",
182 "else",
183 "match",
184 "for",
185 "while",
186 "loop",
187 "return",
188 "break",
189 "continue",
190 "where",
191 "async",
192 "await",
193 "move",
194 "ref",
195 "type",
196 "self",
197 "super",
198 "crate",
199 "function",
201 "var",
202 "class",
203 "interface",
204 "export",
205 "import",
206 "from",
207 "default",
208 "extends",
209 "implements",
210 "new",
211 "this",
212 "typeof",
213 "instanceof",
214 "void",
215 "null",
216 "undefined",
217 "true",
218 "false",
219 "try",
220 "catch",
221 "throw",
222 "finally",
223 "def",
225 "class",
226 "import",
227 "from",
228 "return",
229 "if",
230 "elif",
231 "else",
232 "for",
233 "while",
234 "with",
235 "as",
236 "try",
237 "except",
238 "raise",
239 "pass",
240 "None",
241 "True",
242 "False",
243 "lambda",
244 "func",
246 "package",
247 "import",
248 "type",
249 "struct",
250 "interface",
251 "map",
252 "chan",
253 "go",
254 "defer",
255 "select",
256 "case",
257 "range",
258 "nil",
259 ]
260 .iter()
261 .copied()
262 .collect();
263
264 let mut identifier_map: HashMap<String, usize> = HashMap::new();
265 let mut result = Vec::with_capacity(tokens.len());
266
267 for token in tokens {
268 if keywords.contains(token.as_str()) {
270 result.push(token.clone());
271 continue;
272 }
273
274 if token.chars().all(|c| c.is_ascii_punctuation()) {
276 result.push(token.clone());
277 continue;
278 }
279
280 if token
282 .chars()
283 .next()
284 .map(|c| c.is_ascii_digit())
285 .unwrap_or(false)
286 {
287 result.push(token.clone());
288 continue;
289 }
290
291 let next_id = identifier_map.len() + 1;
293 let idx = identifier_map.entry(token.clone()).or_insert(next_id);
294 result.push(format!("_{}", idx));
295 }
296
297 result
298}
299
300pub fn jaccard_similarity(a: &[String], b: &[String]) -> f64 {
305 if a.is_empty() && b.is_empty() {
306 return 1.0;
307 }
308 if a.is_empty() || b.is_empty() {
309 return 0.0;
310 }
311
312 let mut counts_a: HashMap<&str, usize> = HashMap::new();
313 let mut counts_b: HashMap<&str, usize> = HashMap::new();
314
315 for token in a {
316 *counts_a.entry(token.as_str()).or_insert(0) += 1;
317 }
318 for token in b {
319 *counts_b.entry(token.as_str()).or_insert(0) += 1;
320 }
321
322 let all_tokens: HashSet<&str> = counts_a.keys().chain(counts_b.keys()).copied().collect();
324
325 let mut intersection = 0usize;
326 let mut union = 0usize;
327
328 for token in &all_tokens {
329 let ca = counts_a.get(token).copied().unwrap_or(0);
330 let cb = counts_b.get(token).copied().unwrap_or(0);
331 intersection += ca.min(cb);
332 union += ca.max(cb);
333 }
334
335 if union == 0 {
336 return 1.0;
337 }
338
339 intersection as f64 / union as f64
340}
341
342fn count_bin(count: usize) -> u8 {
349 match count {
350 0 => 0,
351 1..=2 => 1,
352 3..=5 => 2,
353 _ => 3,
354 }
355}
356
357fn line_bin(lines: usize) -> u8 {
360 match lines {
361 0..=5 => 0,
362 6..=15 => 1,
363 16..=50 => 2,
364 _ => 3,
365 }
366}
367
368fn child_bin(count: usize) -> u8 {
371 match count {
372 0 => 0,
373 1..=3 => 1,
374 _ => 2,
375 }
376}
377
378pub fn bucket_key(fp: &StructuralFingerprint) -> BucketKey {
380 BucketKey {
381 kind: fp.symbol_kind,
382 callee_bin: count_bin(fp.callee_count),
383 caller_bin: count_bin(fp.caller_count),
384 line_bin: line_bin(fp.body_line_count),
385 child_bin: child_bin(fp.child_count),
386 }
387}
388
389pub fn group_into_buckets(
391 fingerprints: &[StructuralFingerprint],
392) -> HashMap<BucketKey, Vec<&StructuralFingerprint>> {
393 let mut map: HashMap<BucketKey, Vec<&StructuralFingerprint>> = HashMap::new();
394 for fp in fingerprints {
395 map.entry(bucket_key(fp)).or_default().push(fp);
396 }
397 map
398}
399
400pub fn compare_pair(
414 source_code_a: &str,
415 source_code_b: &str,
416 cross_language: bool,
417 threshold: f64,
418) -> Option<CloneMatch> {
419 if cross_language {
420 return Some(CloneMatch {
421 source: String::new(),
422 target: String::new(),
423 similarity: 1.0,
424 clone_type: CloneType::StructuralOnly,
425 });
426 }
427
428 let tokens_a = tokenize(source_code_a);
429 let tokens_b = tokenize(source_code_b);
430
431 if tokens_a.is_empty() || tokens_b.is_empty() {
432 return None;
433 }
434
435 let raw_sim = jaccard_similarity(&tokens_a, &tokens_b);
436 if raw_sim >= 0.95 {
437 return Some(CloneMatch {
438 source: String::new(),
439 target: String::new(),
440 similarity: raw_sim,
441 clone_type: CloneType::Type1,
442 });
443 }
444
445 let norm_a = normalize_identifiers(&tokens_a);
446 let norm_b = normalize_identifiers(&tokens_b);
447 let norm_sim = jaccard_similarity(&norm_a, &norm_b);
448 if norm_sim >= threshold {
449 return Some(CloneMatch {
450 source: String::new(),
451 target: String::new(),
452 similarity: norm_sim,
453 clone_type: CloneType::Type2,
454 });
455 }
456
457 None
458}
459
460pub fn cluster_matches(matches: &[CloneMatch]) -> Vec<CloneCluster> {
463 if matches.is_empty() {
464 return Vec::new();
465 }
466
467 let mut name_to_id: HashMap<String, usize> = HashMap::new();
469 let mut id_to_name: Vec<String> = Vec::new();
470
471 for m in matches {
472 for name in [m.source.as_str(), m.target.as_str()] {
473 if !name_to_id.contains_key(name) {
474 let id = id_to_name.len();
475 name_to_id.insert(name.to_string(), id);
476 id_to_name.push(name.to_string());
477 }
478 }
479 }
480
481 let n = id_to_name.len();
482
483 let mut parent: Vec<usize> = (0..n).collect();
485 let mut rank: Vec<usize> = vec![0; n];
486
487 fn find(parent: &mut [usize], x: usize) -> usize {
488 if parent[x] != x {
489 parent[x] = find(parent, parent[x]);
490 }
491 parent[x]
492 }
493
494 fn union(parent: &mut [usize], rank: &mut [usize], x: usize, y: usize) {
495 let rx = find(parent, x);
496 let ry = find(parent, y);
497 if rx == ry {
498 return;
499 }
500 match rank[rx].cmp(&rank[ry]) {
501 std::cmp::Ordering::Less => parent[rx] = ry,
502 std::cmp::Ordering::Greater => parent[ry] = rx,
503 std::cmp::Ordering::Equal => {
504 parent[ry] = rx;
505 rank[rx] += 1;
506 }
507 }
508 }
509
510 for m in matches {
511 let a = name_to_id[m.source.as_str()];
512 let b = name_to_id[m.target.as_str()];
513 union(&mut parent, &mut rank, a, b);
514 }
515
516 let mut components: HashMap<usize, Vec<usize>> = HashMap::new();
518 for i in 0..n {
519 let root = find(&mut parent, i);
520 components.entry(root).or_default().push(i);
521 }
522
523 let mut clusters: Vec<CloneCluster> = components
525 .values()
526 .map(|member_ids| {
527 let mut members: Vec<String> = member_ids
529 .iter()
530 .map(|&id| id_to_name[id].to_string())
531 .collect();
532 members.sort();
533
534 let member_set: HashSet<&str> = members.iter().map(String::as_str).collect();
535
536 let intra: Vec<&CloneMatch> = matches
538 .iter()
539 .filter(|m| {
540 member_set.contains(m.source.as_str()) && member_set.contains(m.target.as_str())
541 })
542 .collect();
543
544 let avg_similarity = if intra.is_empty() {
545 0.0
546 } else {
547 intra.iter().map(|m| m.similarity).sum::<f64>() / intra.len() as f64
548 };
549
550 let dominant_type = if intra.iter().any(|m| m.clone_type == CloneType::Type1) {
552 CloneType::Type1
553 } else if intra.iter().any(|m| m.clone_type == CloneType::Type2) {
554 CloneType::Type2
555 } else {
556 CloneType::StructuralOnly
557 };
558
559 let representative = members[0].clone(); let intra_matches: Vec<CloneMatch> = intra.into_iter().cloned().collect();
562
563 CloneCluster {
564 id: 0, members,
566 avg_similarity,
567 clone_type: dominant_type,
568 representative,
569 intra_matches,
570 }
571 })
572 .collect();
573
574 clusters.sort_by(|a, b| {
576 b.members
577 .len()
578 .cmp(&a.members.len())
579 .then_with(|| a.representative.cmp(&b.representative))
580 });
581
582 for (i, cluster) in clusters.iter_mut().enumerate() {
584 cluster.id = i + 1;
585 }
586
587 clusters
588}
589
590#[cfg(test)]
591mod tests {
592 use super::*;
593
594 #[test]
595 fn jaccard_identical_tokens() {
596 let a = vec!["fn".to_string(), "foo".into(), "(".into(), ")".into()];
597 let b = a.clone();
598 assert!((jaccard_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
599 }
600
601 #[test]
602 fn jaccard_disjoint_tokens() {
603 let a = vec!["fn".to_string(), "foo".into()];
604 let b = vec!["class".to_string(), "Bar".into()];
605 assert!((jaccard_similarity(&a, &b) - 0.0).abs() < f64::EPSILON);
606 }
607
608 #[test]
609 fn jaccard_partial_overlap() {
610 let a = vec!["fn".to_string(), "foo".into(), "(".into()];
611 let b = vec!["fn".to_string(), "bar".into(), "(".into()];
612 assert!((jaccard_similarity(&a, &b) - 0.5).abs() < f64::EPSILON);
614 }
615
616 #[test]
617 fn normalize_replaces_identifiers_with_positional_placeholders() {
618 let tokens = vec![
619 "fn".to_string(),
620 "add".into(),
621 "(".into(),
622 "a".into(),
623 ",".into(),
624 "b".into(),
625 ")".into(),
626 "{".into(),
627 "a".into(),
628 "+".into(),
629 "b".into(),
630 "}".into(),
631 ];
632 let normalized = normalize_identifiers(&tokens);
633 assert_eq!(normalized[0], "fn"); assert_eq!(normalized[1], "_1"); assert_eq!(normalized[3], "_2"); assert_eq!(normalized[5], "_3"); assert_eq!(normalized[8], "_2"); assert_eq!(normalized[10], "_3"); }
640
641 #[test]
642 fn type2_clones_detected_after_normalization() {
643 let tokens_a = vec![
644 "fn".to_string(),
645 "add".into(),
646 "(".into(),
647 "x".into(),
648 ",".into(),
649 "y".into(),
650 ")".into(),
651 "{".into(),
652 "x".into(),
653 "+".into(),
654 "y".into(),
655 "}".into(),
656 ];
657 let tokens_b = vec![
658 "fn".to_string(),
659 "sum".into(),
660 "(".into(),
661 "a".into(),
662 ",".into(),
663 "b".into(),
664 ")".into(),
665 "{".into(),
666 "a".into(),
667 "+".into(),
668 "b".into(),
669 "}".into(),
670 ];
671 let raw_score = jaccard_similarity(&tokens_a, &tokens_b);
672 assert!(raw_score < 0.95);
673 let norm_a = normalize_identifiers(&tokens_a);
674 let norm_b = normalize_identifiers(&tokens_b);
675 let norm_score = jaccard_similarity(&norm_a, &norm_b);
676 assert!((norm_score - 1.0).abs() < f64::EPSILON);
677 }
678
679 #[test]
680 fn tokenize_strips_comments_and_splits() {
681 let source = "fn foo() { // comment\n let x = 1;\n}";
682 let tokens = tokenize(source);
683 assert!(!tokens.contains(&"comment".to_string()));
684 assert!(tokens.contains(&"fn".to_string()));
685 assert!(tokens.contains(&"foo".to_string()));
686 }
687
688 #[test]
689 fn tokenize_empty_source() {
690 assert!(tokenize("").is_empty());
691 }
692
693 #[test]
694 fn jaccard_both_empty() {
695 assert!((jaccard_similarity(&[], &[]) - 1.0).abs() < f64::EPSILON);
696 }
697
698 use crate::model::{Language, SymbolKind};
703 use std::path::PathBuf;
704
705 #[test]
706 fn bucket_key_groups_similar_symbols() {
707 let fp1 = StructuralFingerprint {
708 qualified_name: "a::foo".into(),
709 symbol_kind: SymbolKind::Function,
710 callee_count: 2,
711 caller_count: 1,
712 edge_kind_set: 0,
713 body_line_count: 10,
714 child_count: 0,
715 language: Language::Rust,
716 file: PathBuf::from("a.rs"),
717 line_start: 1,
718 line_end: 10,
719 };
720 let fp2 = StructuralFingerprint {
721 qualified_name: "b::bar".into(),
722 symbol_kind: SymbolKind::Function,
723 callee_count: 1,
724 caller_count: 1,
725 edge_kind_set: 0,
726 body_line_count: 10,
727 child_count: 0,
728 language: Language::Rust,
729 file: PathBuf::from("b.rs"),
730 line_start: 1,
731 line_end: 10,
732 };
733 assert_eq!(bucket_key(&fp1), bucket_key(&fp2)); }
735
736 #[test]
737 fn bucket_key_separates_different_kinds() {
738 let fp1 = StructuralFingerprint {
739 qualified_name: "a::foo".into(),
740 symbol_kind: SymbolKind::Function,
741 callee_count: 0,
742 caller_count: 0,
743 edge_kind_set: 0,
744 body_line_count: 10,
745 child_count: 0,
746 language: Language::Rust,
747 file: PathBuf::from("a.rs"),
748 line_start: 1,
749 line_end: 10,
750 };
751 let fp2 = StructuralFingerprint {
752 qualified_name: "a::bar".into(),
753 symbol_kind: SymbolKind::Method,
754 callee_count: 0,
755 caller_count: 0,
756 edge_kind_set: 0,
757 body_line_count: 10,
758 child_count: 0,
759 language: Language::Rust,
760 file: PathBuf::from("a.rs"),
761 line_start: 1,
762 line_end: 10,
763 };
764 assert_ne!(bucket_key(&fp1), bucket_key(&fp2));
765 }
766
767 #[test]
768 fn group_into_buckets_correct_grouping() {
769 let fps = vec![
770 StructuralFingerprint {
771 qualified_name: "a::f1".into(),
772 symbol_kind: SymbolKind::Function,
773 callee_count: 1,
774 caller_count: 0,
775 edge_kind_set: 0,
776 body_line_count: 10,
777 child_count: 0,
778 language: Language::Rust,
779 file: PathBuf::from("a.rs"),
780 line_start: 1,
781 line_end: 10,
782 },
783 StructuralFingerprint {
784 qualified_name: "b::f2".into(),
785 symbol_kind: SymbolKind::Function,
786 callee_count: 2,
787 caller_count: 0,
788 edge_kind_set: 0,
789 body_line_count: 12,
790 child_count: 0,
791 language: Language::Rust,
792 file: PathBuf::from("b.rs"),
793 line_start: 1,
794 line_end: 12,
795 },
796 StructuralFingerprint {
797 qualified_name: "c::f3".into(),
798 symbol_kind: SymbolKind::Class,
799 callee_count: 1,
800 caller_count: 0,
801 edge_kind_set: 0,
802 body_line_count: 10,
803 child_count: 0,
804 language: Language::Rust,
805 file: PathBuf::from("c.rs"),
806 line_start: 1,
807 line_end: 10,
808 },
809 ];
810 let buckets = group_into_buckets(&fps);
811 assert_eq!(buckets.len(), 2); let max_bucket = buckets.values().map(|v| v.len()).max().unwrap();
813 assert_eq!(max_bucket, 2);
814 }
815
816 #[test]
817 fn count_bin_boundaries() {
818 assert_eq!(count_bin(0), 0);
819 assert_eq!(count_bin(1), 1);
820 assert_eq!(count_bin(2), 1);
821 assert_eq!(count_bin(3), 2);
822 assert_eq!(count_bin(5), 2);
823 assert_eq!(count_bin(6), 3);
824 assert_eq!(count_bin(100), 3);
825 }
826
827 use crate::model::{Location, Visibility};
832
833 fn make_symbol(
834 name: &str,
835 kind: SymbolKind,
836 line_start: usize,
837 line_end: usize,
838 ) -> crate::model::SymbolNode {
839 crate::model::SymbolNode {
840 name: name.to_string(),
841 qualified_name: format!("test.rs::{name}"),
842 kind,
843 location: Location {
844 file: PathBuf::from("test.rs"),
845 line_start,
846 line_end,
847 col_start: 0,
848 col_end: 0,
849 },
850 visibility: Visibility::Public,
851 is_exported: false,
852 is_async: false,
853 is_test: false,
854 decorators: vec![],
855 signature: None,
856 }
857 }
858
859 fn make_edge(source: &str, target: &str, kind: EdgeKind) -> Edge {
860 Edge {
861 kind,
862 source: format!("test.rs::{source}"),
863 target: format!("test.rs::{target}"),
864 metadata: None,
865 }
866 }
867
868 #[test]
869 fn fingerprint_captures_symbol_structure() {
870 let symbols = vec![make_symbol("foo", SymbolKind::Function, 1, 20)];
871 let edges = vec![
872 make_edge("foo", "bar", EdgeKind::Calls),
873 make_edge("foo", "baz", EdgeKind::Calls),
874 make_edge("qux", "foo", EdgeKind::Calls),
875 ];
876 let config = CloneConfig {
877 min_lines: 1,
878 ..CloneConfig::default()
879 };
880
881 let fps = compute_fingerprints(&symbols, &edges, &config);
882
883 assert_eq!(fps.len(), 1);
884 let fp = &fps[0];
885 assert_eq!(fp.callee_count, 2, "should count 2 outgoing Calls edges");
886 assert_eq!(fp.caller_count, 1, "should count 1 incoming Calls edge");
887 assert_eq!(fp.body_line_count, 20, "line_end - line_start + 1 = 20");
888 assert_eq!(fp.child_count, 0, "no Contains edges");
889 }
890
891 #[test]
892 fn fingerprint_filters_by_min_lines() {
893 let symbols = vec![make_symbol("tiny", SymbolKind::Function, 1, 3)];
895 let edges = vec![];
896 let config = CloneConfig::default(); let fps = compute_fingerprints(&symbols, &edges, &config);
899
900 assert_eq!(fps.len(), 0, "symbol with 3 lines should be filtered out");
901 }
902
903 #[test]
904 fn fingerprint_counts_contains_edges_as_children() {
905 let symbols = vec![make_symbol("MyClass", SymbolKind::Class, 1, 50)];
906 let edges = vec![
907 make_edge("MyClass", "method_a", EdgeKind::Contains),
908 make_edge("MyClass", "method_b", EdgeKind::Contains),
909 make_edge("MyClass", "field_c", EdgeKind::Contains),
910 ];
911 let config = CloneConfig {
912 min_lines: 1,
913 ..CloneConfig::default()
914 };
915
916 let fps = compute_fingerprints(&symbols, &edges, &config);
917
918 assert_eq!(fps.len(), 1);
919 let fp = &fps[0];
920 assert_eq!(
921 fp.child_count, 3,
922 "should count 3 Contains edges as children"
923 );
924 assert_eq!(
925 fp.callee_count, 0,
926 "Contains has Structural confidence, not High"
927 );
928 }
929
930 #[test]
931 fn fingerprint_empty_input() {
932 let fps = compute_fingerprints(&[], &[], &CloneConfig::default());
933 assert_eq!(fps.len(), 0);
934 }
935
936 #[test]
937 fn fingerprint_edge_kind_bitset() {
938 let symbols = vec![make_symbol("node", SymbolKind::Function, 1, 10)];
939 let edges = vec![
940 make_edge("node", "a", EdgeKind::Calls),
941 make_edge("node", "b", EdgeKind::Extends),
942 make_edge("node", "c", EdgeKind::Contains),
943 ];
944 let config = CloneConfig {
945 min_lines: 1,
946 ..CloneConfig::default()
947 };
948
949 let fps = compute_fingerprints(&symbols, &edges, &config);
950 assert_eq!(fps.len(), 1);
951
952 let fp = &fps[0];
953 let calls_bit = 1u32 << edge_kind_index(&EdgeKind::Calls);
954 let extends_bit = 1u32 << edge_kind_index(&EdgeKind::Extends);
955 let contains_bit = 1u32 << edge_kind_index(&EdgeKind::Contains);
956
957 assert_ne!(fp.edge_kind_set & calls_bit, 0, "Calls bit should be set");
958 assert_ne!(
959 fp.edge_kind_set & extends_bit,
960 0,
961 "Extends bit should be set"
962 );
963 assert_ne!(
964 fp.edge_kind_set & contains_bit,
965 0,
966 "Contains bit should be set"
967 );
968
969 let imports_bit = 1u32 << edge_kind_index(&EdgeKind::ImportsFrom);
971 assert_eq!(
972 fp.edge_kind_set & imports_bit,
973 0,
974 "ImportsFrom bit should NOT be set"
975 );
976 }
977
978 use crate::model::{CloneMatch, CloneType};
983
984 #[test]
985 fn compare_pair_type1_exact_match() {
986 let src = "fn foo(x: i32, y: i32) -> i32 { x + y }";
987 let result = compare_pair(src, src, false, 0.7);
988 assert!(result.is_some());
989 let m = result.unwrap();
990 assert_eq!(m.clone_type, CloneType::Type1);
991 assert!(m.similarity >= 0.95);
992 }
993
994 #[test]
995 fn compare_pair_type2_renamed_vars() {
996 let src_a = "fn add(x: i32, y: i32) -> i32 { x + y }";
997 let src_b = "fn sum(a: i32, b: i32) -> i32 { a + b }";
998 let result = compare_pair(src_a, src_b, false, 0.7);
999 assert!(result.is_some());
1000 assert_eq!(result.unwrap().clone_type, CloneType::Type2);
1001 }
1002
1003 #[test]
1004 fn compare_pair_cross_language_structural_only() {
1005 let result = compare_pair("", "", true, 0.7);
1006 assert!(result.is_some());
1007 assert_eq!(result.unwrap().clone_type, CloneType::StructuralOnly);
1008 }
1009
1010 #[test]
1011 fn compare_pair_below_threshold_returns_none() {
1012 let src_a = "fn add(x: i32) -> i32 { x + 1 }";
1013 let src_b = "class Foo { bar: string; baz(): void { console.log(this.bar); } }";
1014 let result = compare_pair(src_a, src_b, false, 0.7);
1015 assert!(result.is_none());
1016 }
1017
1018 #[test]
1019 fn cluster_transitive() {
1020 let matches = vec![
1021 CloneMatch {
1022 source: "a".into(),
1023 target: "b".into(),
1024 similarity: 0.8,
1025 clone_type: CloneType::Type2,
1026 },
1027 CloneMatch {
1028 source: "b".into(),
1029 target: "c".into(),
1030 similarity: 0.75,
1031 clone_type: CloneType::Type2,
1032 },
1033 ];
1034 let clusters = cluster_matches(&matches);
1035 assert_eq!(clusters.len(), 1);
1036 assert_eq!(clusters[0].members.len(), 3);
1037 assert_eq!(clusters[0].id, 1);
1038 }
1039
1040 #[test]
1041 fn cluster_separate_components() {
1042 let matches = vec![
1043 CloneMatch {
1044 source: "a".into(),
1045 target: "b".into(),
1046 similarity: 0.8,
1047 clone_type: CloneType::Type1,
1048 },
1049 CloneMatch {
1050 source: "c".into(),
1051 target: "d".into(),
1052 similarity: 0.9,
1053 clone_type: CloneType::Type1,
1054 },
1055 ];
1056 let clusters = cluster_matches(&matches);
1057 assert_eq!(clusters.len(), 2);
1058 }
1059
1060 #[test]
1061 fn cluster_empty_matches() {
1062 assert!(cluster_matches(&[]).is_empty());
1063 }
1064
1065 #[test]
1066 fn cluster_ids_descending_by_size() {
1067 let matches = vec![
1068 CloneMatch {
1069 source: "a".into(),
1070 target: "b".into(),
1071 similarity: 0.8,
1072 clone_type: CloneType::Type2,
1073 },
1074 CloneMatch {
1075 source: "c".into(),
1076 target: "d".into(),
1077 similarity: 0.9,
1078 clone_type: CloneType::Type1,
1079 },
1080 CloneMatch {
1081 source: "c".into(),
1082 target: "e".into(),
1083 similarity: 0.85,
1084 clone_type: CloneType::Type1,
1085 },
1086 ];
1087 let clusters = cluster_matches(&matches);
1088 assert_eq!(clusters[0].members.len(), 3); assert_eq!(clusters[0].id, 1);
1090 assert_eq!(clusters[1].members.len(), 2); assert_eq!(clusters[1].id, 2);
1092 }
1093}