Skip to main content

the_code_graph_domain/analysis/
clones.rs

1use crate::model::{
2    BucketKey, CloneCluster, CloneConfig, CloneMatch, CloneType, Confidence, Edge, EdgeKind,
3    Language, StructuralFingerprint, SymbolNode,
4};
5use std::collections::{HashMap, HashSet};
6
7// ---------------------------------------------------------------------------
8// T03 — Structural fingerprinting
9// ---------------------------------------------------------------------------
10
11/// Maps each `EdgeKind` variant to a unique bit index (0..15).
12fn edge_kind_index(kind: &EdgeKind) -> u32 {
13    match kind {
14        EdgeKind::Contains => 0,
15        EdgeKind::ChildOf => 1,
16        EdgeKind::Calls => 2,
17        EdgeKind::ImportsFrom => 3,
18        EdgeKind::Extends => 4,
19        EdgeKind::Implements => 5,
20        EdgeKind::TestedBy => 6,
21        EdgeKind::DependsOn => 7,
22        EdgeKind::BarrelReExportAll => 8,
23        EdgeKind::ConditionalImport => 9,
24        EdgeKind::SideEffectImport => 10,
25        EdgeKind::DotImport => 11,
26        EdgeKind::HasDecorator => 12,
27        EdgeKind::Embeds => 13,
28        EdgeKind::TypeReference => 14,
29        EdgeKind::ReExport => 15,
30    }
31}
32
33/// Compute structural fingerprints for all symbols that pass the `min_lines`
34/// filter in `config`.
35///
36/// Complexity: O(S + E) — one pass to build adjacency maps, one pass over
37/// symbols.
38pub fn compute_fingerprints(
39    symbols: &[SymbolNode],
40    edges: &[Edge],
41    config: &CloneConfig,
42) -> Vec<StructuralFingerprint> {
43    // Build outgoing adjacency map: source -> Vec<&Edge>
44    let mut outgoing: HashMap<&str, Vec<&Edge>> = HashMap::new();
45    // Build incoming adjacency map: target -> Vec<&Edge>
46    let mut incoming: HashMap<&str, Vec<&Edge>> = HashMap::new();
47
48    for edge in edges {
49        outgoing.entry(edge.source.as_str()).or_default().push(edge);
50        incoming.entry(edge.target.as_str()).or_default().push(edge);
51    }
52
53    let mut fingerprints = Vec::with_capacity(symbols.len());
54
55    for sym in symbols {
56        let body_line_count = sym
57            .location
58            .line_end
59            .saturating_sub(sym.location.line_start)
60            + 1;
61
62        if body_line_count < config.min_lines {
63            continue;
64        }
65
66        let empty = Vec::new();
67        let out_edges = outgoing.get(sym.qualified_name.as_str()).unwrap_or(&empty);
68        let in_edges = incoming.get(sym.qualified_name.as_str()).unwrap_or(&empty);
69
70        // Callee count: outgoing edges with confidence >= High
71        let callee_count = out_edges
72            .iter()
73            .filter(|e| e.kind.confidence() >= Confidence::High)
74            .count();
75
76        // Caller count: incoming edges with confidence >= High
77        let caller_count = in_edges
78            .iter()
79            .filter(|e| e.kind.confidence() >= Confidence::High)
80            .count();
81
82        // Child count: outgoing Contains edges
83        let child_count = out_edges
84            .iter()
85            .filter(|e| e.kind == EdgeKind::Contains)
86            .count();
87
88        // Edge kind bitset: one bit per EdgeKind present in outgoing edges
89        let mut edge_kind_set: u32 = 0;
90        for e in out_edges.iter() {
91            edge_kind_set |= 1u32 << edge_kind_index(&e.kind);
92        }
93
94        let language = Language::from_path(&sym.location.file);
95
96        fingerprints.push(StructuralFingerprint {
97            qualified_name: sym.qualified_name.clone(),
98            symbol_kind: sym.kind,
99            callee_count,
100            caller_count,
101            edge_kind_set,
102            body_line_count,
103            child_count,
104            language,
105            file: sym.location.file.clone(),
106            line_start: sym.location.line_start,
107            line_end: sym.location.line_end,
108        });
109    }
110
111    fingerprints
112}
113
114/// Split source code into tokens.
115/// - Strips line comments (`//` and `#`)
116/// - Splits on whitespace
117/// - Splits tokens further on punctuation boundaries (keeps punctuation as separate tokens)
118/// - Skips empty tokens
119pub fn tokenize(source: &str) -> Vec<String> {
120    let mut tokens = Vec::new();
121
122    for line in source.lines() {
123        // Strip line comments: // and #
124        let stripped = if let Some(pos) = line.find("//") {
125            &line[..pos]
126        } else if let Some(pos) = line.find('#') {
127            &line[..pos]
128        } else {
129            line
130        };
131
132        for word in stripped.split_whitespace() {
133            // Split word on punctuation boundaries, keeping punctuation as separate tokens
134            split_on_punctuation(word, &mut tokens);
135        }
136    }
137
138    tokens
139}
140
141/// Split a word on punctuation boundaries, keeping punctuation as separate tokens.
142fn split_on_punctuation(word: &str, out: &mut Vec<String>) {
143    let mut current = String::new();
144
145    for ch in word.chars() {
146        if ch.is_ascii_punctuation() {
147            // Flush current alphanumeric token
148            if !current.is_empty() {
149                out.push(current.clone());
150                current.clear();
151            }
152            // Push punctuation as its own token
153            out.push(ch.to_string());
154        } else {
155            current.push(ch);
156        }
157    }
158
159    if !current.is_empty() {
160        out.push(current);
161    }
162}
163
164/// Replace identifiers with positional placeholders.
165/// - Keywords, operators/punctuation, and numeric literals are kept as-is.
166/// - Identifiers are replaced with `_1`, `_2`, etc. (same identifier → same placeholder everywhere).
167pub fn normalize_identifiers(tokens: &[String]) -> Vec<String> {
168    let keywords: HashSet<&str> = [
169        // Rust
170        "fn",
171        "let",
172        "mut",
173        "const",
174        "struct",
175        "enum",
176        "impl",
177        "trait",
178        "pub",
179        "use",
180        "mod",
181        "if",
182        "else",
183        "match",
184        "for",
185        "while",
186        "loop",
187        "return",
188        "break",
189        "continue",
190        "where",
191        "async",
192        "await",
193        "move",
194        "ref",
195        "type",
196        "self",
197        "super",
198        "crate",
199        // TypeScript/JS
200        "function",
201        "var",
202        "class",
203        "interface",
204        "export",
205        "import",
206        "from",
207        "default",
208        "extends",
209        "implements",
210        "new",
211        "this",
212        "typeof",
213        "instanceof",
214        "void",
215        "null",
216        "undefined",
217        "true",
218        "false",
219        "try",
220        "catch",
221        "throw",
222        "finally",
223        // Python
224        "def",
225        "class",
226        "import",
227        "from",
228        "return",
229        "if",
230        "elif",
231        "else",
232        "for",
233        "while",
234        "with",
235        "as",
236        "try",
237        "except",
238        "raise",
239        "pass",
240        "None",
241        "True",
242        "False",
243        "lambda",
244        // Go
245        "func",
246        "package",
247        "import",
248        "type",
249        "struct",
250        "interface",
251        "map",
252        "chan",
253        "go",
254        "defer",
255        "select",
256        "case",
257        "range",
258        "nil",
259    ]
260    .iter()
261    .copied()
262    .collect();
263
264    let mut identifier_map: HashMap<String, usize> = HashMap::new();
265    let mut result = Vec::with_capacity(tokens.len());
266
267    for token in tokens {
268        // Keep keywords as-is
269        if keywords.contains(token.as_str()) {
270            result.push(token.clone());
271            continue;
272        }
273
274        // Keep purely punctuation tokens as-is (single or multi-char operator/punct)
275        if token.chars().all(|c| c.is_ascii_punctuation()) {
276            result.push(token.clone());
277            continue;
278        }
279
280        // Keep numeric literals as-is
281        if token
282            .chars()
283            .next()
284            .map(|c| c.is_ascii_digit())
285            .unwrap_or(false)
286        {
287            result.push(token.clone());
288            continue;
289        }
290
291        // It's an identifier — replace with positional placeholder
292        let next_id = identifier_map.len() + 1;
293        let idx = identifier_map.entry(token.clone()).or_insert(next_id);
294        result.push(format!("_{}", idx));
295    }
296
297    result
298}
299
300/// Jaccard similarity on token multisets.
301/// - intersection = sum of min(count_a, count_b) for each token
302/// - union = sum of max(count_a, count_b) for each token
303/// - Returns intersection / union (both empty → 1.0, one empty → 0.0)
304pub fn jaccard_similarity(a: &[String], b: &[String]) -> f64 {
305    if a.is_empty() && b.is_empty() {
306        return 1.0;
307    }
308    if a.is_empty() || b.is_empty() {
309        return 0.0;
310    }
311
312    let mut counts_a: HashMap<&str, usize> = HashMap::new();
313    let mut counts_b: HashMap<&str, usize> = HashMap::new();
314
315    for token in a {
316        *counts_a.entry(token.as_str()).or_insert(0) += 1;
317    }
318    for token in b {
319        *counts_b.entry(token.as_str()).or_insert(0) += 1;
320    }
321
322    // Collect all unique tokens
323    let all_tokens: HashSet<&str> = counts_a.keys().chain(counts_b.keys()).copied().collect();
324
325    let mut intersection = 0usize;
326    let mut union = 0usize;
327
328    for token in &all_tokens {
329        let ca = counts_a.get(token).copied().unwrap_or(0);
330        let cb = counts_b.get(token).copied().unwrap_or(0);
331        intersection += ca.min(cb);
332        union += ca.max(cb);
333    }
334
335    if union == 0 {
336        return 1.0;
337    }
338
339    intersection as f64 / union as f64
340}
341
342// ---------------------------------------------------------------------------
343// T04 — Quantize-and-concatenate bucketing
344// ---------------------------------------------------------------------------
345
346/// Quantize a call count into a 2-bit bin.
347/// 0 → 0, 1–2 → 1, 3–5 → 2, 6+ → 3
348fn count_bin(count: usize) -> u8 {
349    match count {
350        0 => 0,
351        1..=2 => 1,
352        3..=5 => 2,
353        _ => 3,
354    }
355}
356
357/// Quantize a line count into a 2-bit bin.
358/// 0–5 → 0, 6–15 → 1, 16–50 → 2, 51+ → 3
359fn line_bin(lines: usize) -> u8 {
360    match lines {
361        0..=5 => 0,
362        6..=15 => 1,
363        16..=50 => 2,
364        _ => 3,
365    }
366}
367
368/// Quantize a child count into a 2-bit bin.
369/// 0 → 0, 1–3 → 1, 4+ → 2
370fn child_bin(count: usize) -> u8 {
371    match count {
372        0 => 0,
373        1..=3 => 1,
374        _ => 2,
375    }
376}
377
378/// Compute the coarse bucket key for a structural fingerprint.
379pub fn bucket_key(fp: &StructuralFingerprint) -> BucketKey {
380    BucketKey {
381        kind: fp.symbol_kind,
382        callee_bin: count_bin(fp.callee_count),
383        caller_bin: count_bin(fp.caller_count),
384        line_bin: line_bin(fp.body_line_count),
385        child_bin: child_bin(fp.child_count),
386    }
387}
388
389/// Group a slice of fingerprints by their bucket key.
390pub fn group_into_buckets(
391    fingerprints: &[StructuralFingerprint],
392) -> HashMap<BucketKey, Vec<&StructuralFingerprint>> {
393    let mut map: HashMap<BucketKey, Vec<&StructuralFingerprint>> = HashMap::new();
394    for fp in fingerprints {
395        map.entry(bucket_key(fp)).or_default().push(fp);
396    }
397    map
398}
399
400// ---------------------------------------------------------------------------
401// T06 — Candidate comparison and connected-component clustering
402// ---------------------------------------------------------------------------
403
404/// Compare two source code snippets and return a `CloneMatch` if they are
405/// similar enough, or `None` otherwise.
406///
407/// - `cross_language`: structural-only path — immediately returns a
408///   `StructuralOnly` match with similarity 1.0.
409/// - Otherwise: raw Jaccard ≥ 0.95 → `Type1`; normalized Jaccard ≥ threshold
410///   → `Type2`; below threshold → `None`.
411/// - The `source` and `target` fields of the returned match are left empty;
412///   the caller is expected to fill them in.
413pub fn compare_pair(
414    source_code_a: &str,
415    source_code_b: &str,
416    cross_language: bool,
417    threshold: f64,
418) -> Option<CloneMatch> {
419    if cross_language {
420        return Some(CloneMatch {
421            source: String::new(),
422            target: String::new(),
423            similarity: 1.0,
424            clone_type: CloneType::StructuralOnly,
425        });
426    }
427
428    let tokens_a = tokenize(source_code_a);
429    let tokens_b = tokenize(source_code_b);
430
431    if tokens_a.is_empty() || tokens_b.is_empty() {
432        return None;
433    }
434
435    let raw_sim = jaccard_similarity(&tokens_a, &tokens_b);
436    if raw_sim >= 0.95 {
437        return Some(CloneMatch {
438            source: String::new(),
439            target: String::new(),
440            similarity: raw_sim,
441            clone_type: CloneType::Type1,
442        });
443    }
444
445    let norm_a = normalize_identifiers(&tokens_a);
446    let norm_b = normalize_identifiers(&tokens_b);
447    let norm_sim = jaccard_similarity(&norm_a, &norm_b);
448    if norm_sim >= threshold {
449        return Some(CloneMatch {
450            source: String::new(),
451            target: String::new(),
452            similarity: norm_sim,
453            clone_type: CloneType::Type2,
454        });
455    }
456
457    None
458}
459
460/// Cluster a slice of `CloneMatch` records into connected components using
461/// Union-Find, then return a sorted `Vec<CloneCluster>` (largest first).
462pub fn cluster_matches(matches: &[CloneMatch]) -> Vec<CloneCluster> {
463    if matches.is_empty() {
464        return Vec::new();
465    }
466
467    // Collect all unique node names and assign integer IDs.
468    let mut name_to_id: HashMap<String, usize> = HashMap::new();
469    let mut id_to_name: Vec<String> = Vec::new();
470
471    for m in matches {
472        for name in [m.source.as_str(), m.target.as_str()] {
473            if !name_to_id.contains_key(name) {
474                let id = id_to_name.len();
475                name_to_id.insert(name.to_string(), id);
476                id_to_name.push(name.to_string());
477            }
478        }
479    }
480
481    let n = id_to_name.len();
482
483    // Union-Find with path compression and union by rank.
484    let mut parent: Vec<usize> = (0..n).collect();
485    let mut rank: Vec<usize> = vec![0; n];
486
487    fn find(parent: &mut [usize], x: usize) -> usize {
488        if parent[x] != x {
489            parent[x] = find(parent, parent[x]);
490        }
491        parent[x]
492    }
493
494    fn union(parent: &mut [usize], rank: &mut [usize], x: usize, y: usize) {
495        let rx = find(parent, x);
496        let ry = find(parent, y);
497        if rx == ry {
498            return;
499        }
500        match rank[rx].cmp(&rank[ry]) {
501            std::cmp::Ordering::Less => parent[rx] = ry,
502            std::cmp::Ordering::Greater => parent[ry] = rx,
503            std::cmp::Ordering::Equal => {
504                parent[ry] = rx;
505                rank[rx] += 1;
506            }
507        }
508    }
509
510    for m in matches {
511        let a = name_to_id[m.source.as_str()];
512        let b = name_to_id[m.target.as_str()];
513        union(&mut parent, &mut rank, a, b);
514    }
515
516    // Group nodes by root.
517    let mut components: HashMap<usize, Vec<usize>> = HashMap::new();
518    for i in 0..n {
519        let root = find(&mut parent, i);
520        components.entry(root).or_default().push(i);
521    }
522
523    // Build one CloneCluster per component.
524    let mut clusters: Vec<CloneCluster> = components
525        .values()
526        .map(|member_ids| {
527            // Collect member names, sorted alphabetically.
528            let mut members: Vec<String> = member_ids
529                .iter()
530                .map(|&id| id_to_name[id].to_string())
531                .collect();
532            members.sort();
533
534            let member_set: HashSet<&str> = members.iter().map(String::as_str).collect();
535
536            // Intra-cluster matches: both endpoints are in this component.
537            let intra: Vec<&CloneMatch> = matches
538                .iter()
539                .filter(|m| {
540                    member_set.contains(m.source.as_str()) && member_set.contains(m.target.as_str())
541                })
542                .collect();
543
544            let avg_similarity = if intra.is_empty() {
545                0.0
546            } else {
547                intra.iter().map(|m| m.similarity).sum::<f64>() / intra.len() as f64
548            };
549
550            // Dominant clone type: Type1 > Type2 > StructuralOnly.
551            let dominant_type = if intra.iter().any(|m| m.clone_type == CloneType::Type1) {
552                CloneType::Type1
553            } else if intra.iter().any(|m| m.clone_type == CloneType::Type2) {
554                CloneType::Type2
555            } else {
556                CloneType::StructuralOnly
557            };
558
559            let representative = members[0].clone(); // first alphabetically
560
561            let intra_matches: Vec<CloneMatch> = intra.into_iter().cloned().collect();
562
563            CloneCluster {
564                id: 0, // assigned after sorting
565                members,
566                avg_similarity,
567                clone_type: dominant_type,
568                representative,
569                intra_matches,
570            }
571        })
572        .collect();
573
574    // Sort: largest first; break ties by representative name.
575    clusters.sort_by(|a, b| {
576        b.members
577            .len()
578            .cmp(&a.members.len())
579            .then_with(|| a.representative.cmp(&b.representative))
580    });
581
582    // Assign 1-indexed IDs.
583    for (i, cluster) in clusters.iter_mut().enumerate() {
584        cluster.id = i + 1;
585    }
586
587    clusters
588}
589
590#[cfg(test)]
591mod tests {
592    use super::*;
593
594    #[test]
595    fn jaccard_identical_tokens() {
596        let a = vec!["fn".to_string(), "foo".into(), "(".into(), ")".into()];
597        let b = a.clone();
598        assert!((jaccard_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
599    }
600
601    #[test]
602    fn jaccard_disjoint_tokens() {
603        let a = vec!["fn".to_string(), "foo".into()];
604        let b = vec!["class".to_string(), "Bar".into()];
605        assert!((jaccard_similarity(&a, &b) - 0.0).abs() < f64::EPSILON);
606    }
607
608    #[test]
609    fn jaccard_partial_overlap() {
610        let a = vec!["fn".to_string(), "foo".into(), "(".into()];
611        let b = vec!["fn".to_string(), "bar".into(), "(".into()];
612        // intersection: fn, ( = 2; union: fn, foo, bar, ( = 4; jaccard = 0.5
613        assert!((jaccard_similarity(&a, &b) - 0.5).abs() < f64::EPSILON);
614    }
615
616    #[test]
617    fn normalize_replaces_identifiers_with_positional_placeholders() {
618        let tokens = vec![
619            "fn".to_string(),
620            "add".into(),
621            "(".into(),
622            "a".into(),
623            ",".into(),
624            "b".into(),
625            ")".into(),
626            "{".into(),
627            "a".into(),
628            "+".into(),
629            "b".into(),
630            "}".into(),
631        ];
632        let normalized = normalize_identifiers(&tokens);
633        assert_eq!(normalized[0], "fn"); // keyword kept
634        assert_eq!(normalized[1], "_1"); // add -> _1
635        assert_eq!(normalized[3], "_2"); // a -> _2
636        assert_eq!(normalized[5], "_3"); // b -> _3
637        assert_eq!(normalized[8], "_2"); // a again -> _2
638        assert_eq!(normalized[10], "_3"); // b again -> _3
639    }
640
641    #[test]
642    fn type2_clones_detected_after_normalization() {
643        let tokens_a = vec![
644            "fn".to_string(),
645            "add".into(),
646            "(".into(),
647            "x".into(),
648            ",".into(),
649            "y".into(),
650            ")".into(),
651            "{".into(),
652            "x".into(),
653            "+".into(),
654            "y".into(),
655            "}".into(),
656        ];
657        let tokens_b = vec![
658            "fn".to_string(),
659            "sum".into(),
660            "(".into(),
661            "a".into(),
662            ",".into(),
663            "b".into(),
664            ")".into(),
665            "{".into(),
666            "a".into(),
667            "+".into(),
668            "b".into(),
669            "}".into(),
670        ];
671        let raw_score = jaccard_similarity(&tokens_a, &tokens_b);
672        assert!(raw_score < 0.95);
673        let norm_a = normalize_identifiers(&tokens_a);
674        let norm_b = normalize_identifiers(&tokens_b);
675        let norm_score = jaccard_similarity(&norm_a, &norm_b);
676        assert!((norm_score - 1.0).abs() < f64::EPSILON);
677    }
678
679    #[test]
680    fn tokenize_strips_comments_and_splits() {
681        let source = "fn foo() { // comment\n  let x = 1;\n}";
682        let tokens = tokenize(source);
683        assert!(!tokens.contains(&"comment".to_string()));
684        assert!(tokens.contains(&"fn".to_string()));
685        assert!(tokens.contains(&"foo".to_string()));
686    }
687
688    #[test]
689    fn tokenize_empty_source() {
690        assert!(tokenize("").is_empty());
691    }
692
693    #[test]
694    fn jaccard_both_empty() {
695        assert!((jaccard_similarity(&[], &[]) - 1.0).abs() < f64::EPSILON);
696    }
697
698    // -----------------------------------------------------------------------
699    // T04 bucketing tests
700    // -----------------------------------------------------------------------
701
702    use crate::model::{Language, SymbolKind};
703    use std::path::PathBuf;
704
705    #[test]
706    fn bucket_key_groups_similar_symbols() {
707        let fp1 = StructuralFingerprint {
708            qualified_name: "a::foo".into(),
709            symbol_kind: SymbolKind::Function,
710            callee_count: 2,
711            caller_count: 1,
712            edge_kind_set: 0,
713            body_line_count: 10,
714            child_count: 0,
715            language: Language::Rust,
716            file: PathBuf::from("a.rs"),
717            line_start: 1,
718            line_end: 10,
719        };
720        let fp2 = StructuralFingerprint {
721            qualified_name: "b::bar".into(),
722            symbol_kind: SymbolKind::Function,
723            callee_count: 1,
724            caller_count: 1,
725            edge_kind_set: 0,
726            body_line_count: 10,
727            child_count: 0,
728            language: Language::Rust,
729            file: PathBuf::from("b.rs"),
730            line_start: 1,
731            line_end: 10,
732        };
733        assert_eq!(bucket_key(&fp1), bucket_key(&fp2)); // both callee in bin 1 (1-2)
734    }
735
736    #[test]
737    fn bucket_key_separates_different_kinds() {
738        let fp1 = StructuralFingerprint {
739            qualified_name: "a::foo".into(),
740            symbol_kind: SymbolKind::Function,
741            callee_count: 0,
742            caller_count: 0,
743            edge_kind_set: 0,
744            body_line_count: 10,
745            child_count: 0,
746            language: Language::Rust,
747            file: PathBuf::from("a.rs"),
748            line_start: 1,
749            line_end: 10,
750        };
751        let fp2 = StructuralFingerprint {
752            qualified_name: "a::bar".into(),
753            symbol_kind: SymbolKind::Method,
754            callee_count: 0,
755            caller_count: 0,
756            edge_kind_set: 0,
757            body_line_count: 10,
758            child_count: 0,
759            language: Language::Rust,
760            file: PathBuf::from("a.rs"),
761            line_start: 1,
762            line_end: 10,
763        };
764        assert_ne!(bucket_key(&fp1), bucket_key(&fp2));
765    }
766
767    #[test]
768    fn group_into_buckets_correct_grouping() {
769        let fps = vec![
770            StructuralFingerprint {
771                qualified_name: "a::f1".into(),
772                symbol_kind: SymbolKind::Function,
773                callee_count: 1,
774                caller_count: 0,
775                edge_kind_set: 0,
776                body_line_count: 10,
777                child_count: 0,
778                language: Language::Rust,
779                file: PathBuf::from("a.rs"),
780                line_start: 1,
781                line_end: 10,
782            },
783            StructuralFingerprint {
784                qualified_name: "b::f2".into(),
785                symbol_kind: SymbolKind::Function,
786                callee_count: 2,
787                caller_count: 0,
788                edge_kind_set: 0,
789                body_line_count: 12,
790                child_count: 0,
791                language: Language::Rust,
792                file: PathBuf::from("b.rs"),
793                line_start: 1,
794                line_end: 12,
795            },
796            StructuralFingerprint {
797                qualified_name: "c::f3".into(),
798                symbol_kind: SymbolKind::Class,
799                callee_count: 1,
800                caller_count: 0,
801                edge_kind_set: 0,
802                body_line_count: 10,
803                child_count: 0,
804                language: Language::Rust,
805                file: PathBuf::from("c.rs"),
806                line_start: 1,
807                line_end: 10,
808            },
809        ];
810        let buckets = group_into_buckets(&fps);
811        assert_eq!(buckets.len(), 2); // f1,f2 in one bucket; f3 in another
812        let max_bucket = buckets.values().map(|v| v.len()).max().unwrap();
813        assert_eq!(max_bucket, 2);
814    }
815
816    #[test]
817    fn count_bin_boundaries() {
818        assert_eq!(count_bin(0), 0);
819        assert_eq!(count_bin(1), 1);
820        assert_eq!(count_bin(2), 1);
821        assert_eq!(count_bin(3), 2);
822        assert_eq!(count_bin(5), 2);
823        assert_eq!(count_bin(6), 3);
824        assert_eq!(count_bin(100), 3);
825    }
826
827    // -----------------------------------------------------------------------
828    // T03 fingerprinting tests
829    // -----------------------------------------------------------------------
830
831    use crate::model::{Location, Visibility};
832
833    fn make_symbol(
834        name: &str,
835        kind: SymbolKind,
836        line_start: usize,
837        line_end: usize,
838    ) -> crate::model::SymbolNode {
839        crate::model::SymbolNode {
840            name: name.to_string(),
841            qualified_name: format!("test.rs::{name}"),
842            kind,
843            location: Location {
844                file: PathBuf::from("test.rs"),
845                line_start,
846                line_end,
847                col_start: 0,
848                col_end: 0,
849            },
850            visibility: Visibility::Public,
851            is_exported: false,
852            is_async: false,
853            is_test: false,
854            decorators: vec![],
855            signature: None,
856        }
857    }
858
859    fn make_edge(source: &str, target: &str, kind: EdgeKind) -> Edge {
860        Edge {
861            kind,
862            source: format!("test.rs::{source}"),
863            target: format!("test.rs::{target}"),
864            metadata: None,
865        }
866    }
867
868    #[test]
869    fn fingerprint_captures_symbol_structure() {
870        let symbols = vec![make_symbol("foo", SymbolKind::Function, 1, 20)];
871        let edges = vec![
872            make_edge("foo", "bar", EdgeKind::Calls),
873            make_edge("foo", "baz", EdgeKind::Calls),
874            make_edge("qux", "foo", EdgeKind::Calls),
875        ];
876        let config = CloneConfig {
877            min_lines: 1,
878            ..CloneConfig::default()
879        };
880
881        let fps = compute_fingerprints(&symbols, &edges, &config);
882
883        assert_eq!(fps.len(), 1);
884        let fp = &fps[0];
885        assert_eq!(fp.callee_count, 2, "should count 2 outgoing Calls edges");
886        assert_eq!(fp.caller_count, 1, "should count 1 incoming Calls edge");
887        assert_eq!(fp.body_line_count, 20, "line_end - line_start + 1 = 20");
888        assert_eq!(fp.child_count, 0, "no Contains edges");
889    }
890
891    #[test]
892    fn fingerprint_filters_by_min_lines() {
893        // Symbol with only 3 lines — below default min_lines of 5
894        let symbols = vec![make_symbol("tiny", SymbolKind::Function, 1, 3)];
895        let edges = vec![];
896        let config = CloneConfig::default(); // min_lines = 5
897
898        let fps = compute_fingerprints(&symbols, &edges, &config);
899
900        assert_eq!(fps.len(), 0, "symbol with 3 lines should be filtered out");
901    }
902
903    #[test]
904    fn fingerprint_counts_contains_edges_as_children() {
905        let symbols = vec![make_symbol("MyClass", SymbolKind::Class, 1, 50)];
906        let edges = vec![
907            make_edge("MyClass", "method_a", EdgeKind::Contains),
908            make_edge("MyClass", "method_b", EdgeKind::Contains),
909            make_edge("MyClass", "field_c", EdgeKind::Contains),
910        ];
911        let config = CloneConfig {
912            min_lines: 1,
913            ..CloneConfig::default()
914        };
915
916        let fps = compute_fingerprints(&symbols, &edges, &config);
917
918        assert_eq!(fps.len(), 1);
919        let fp = &fps[0];
920        assert_eq!(
921            fp.child_count, 3,
922            "should count 3 Contains edges as children"
923        );
924        assert_eq!(
925            fp.callee_count, 0,
926            "Contains has Structural confidence, not High"
927        );
928    }
929
930    #[test]
931    fn fingerprint_empty_input() {
932        let fps = compute_fingerprints(&[], &[], &CloneConfig::default());
933        assert_eq!(fps.len(), 0);
934    }
935
936    #[test]
937    fn fingerprint_edge_kind_bitset() {
938        let symbols = vec![make_symbol("node", SymbolKind::Function, 1, 10)];
939        let edges = vec![
940            make_edge("node", "a", EdgeKind::Calls),
941            make_edge("node", "b", EdgeKind::Extends),
942            make_edge("node", "c", EdgeKind::Contains),
943        ];
944        let config = CloneConfig {
945            min_lines: 1,
946            ..CloneConfig::default()
947        };
948
949        let fps = compute_fingerprints(&symbols, &edges, &config);
950        assert_eq!(fps.len(), 1);
951
952        let fp = &fps[0];
953        let calls_bit = 1u32 << edge_kind_index(&EdgeKind::Calls);
954        let extends_bit = 1u32 << edge_kind_index(&EdgeKind::Extends);
955        let contains_bit = 1u32 << edge_kind_index(&EdgeKind::Contains);
956
957        assert_ne!(fp.edge_kind_set & calls_bit, 0, "Calls bit should be set");
958        assert_ne!(
959            fp.edge_kind_set & extends_bit,
960            0,
961            "Extends bit should be set"
962        );
963        assert_ne!(
964            fp.edge_kind_set & contains_bit,
965            0,
966            "Contains bit should be set"
967        );
968
969        // A variant not in the edges should NOT be set
970        let imports_bit = 1u32 << edge_kind_index(&EdgeKind::ImportsFrom);
971        assert_eq!(
972            fp.edge_kind_set & imports_bit,
973            0,
974            "ImportsFrom bit should NOT be set"
975        );
976    }
977
978    // -----------------------------------------------------------------------
979    // T06 compare_pair and cluster_matches tests
980    // -----------------------------------------------------------------------
981
982    use crate::model::{CloneMatch, CloneType};
983
984    #[test]
985    fn compare_pair_type1_exact_match() {
986        let src = "fn foo(x: i32, y: i32) -> i32 { x + y }";
987        let result = compare_pair(src, src, false, 0.7);
988        assert!(result.is_some());
989        let m = result.unwrap();
990        assert_eq!(m.clone_type, CloneType::Type1);
991        assert!(m.similarity >= 0.95);
992    }
993
994    #[test]
995    fn compare_pair_type2_renamed_vars() {
996        let src_a = "fn add(x: i32, y: i32) -> i32 { x + y }";
997        let src_b = "fn sum(a: i32, b: i32) -> i32 { a + b }";
998        let result = compare_pair(src_a, src_b, false, 0.7);
999        assert!(result.is_some());
1000        assert_eq!(result.unwrap().clone_type, CloneType::Type2);
1001    }
1002
1003    #[test]
1004    fn compare_pair_cross_language_structural_only() {
1005        let result = compare_pair("", "", true, 0.7);
1006        assert!(result.is_some());
1007        assert_eq!(result.unwrap().clone_type, CloneType::StructuralOnly);
1008    }
1009
1010    #[test]
1011    fn compare_pair_below_threshold_returns_none() {
1012        let src_a = "fn add(x: i32) -> i32 { x + 1 }";
1013        let src_b = "class Foo { bar: string; baz(): void { console.log(this.bar); } }";
1014        let result = compare_pair(src_a, src_b, false, 0.7);
1015        assert!(result.is_none());
1016    }
1017
1018    #[test]
1019    fn cluster_transitive() {
1020        let matches = vec![
1021            CloneMatch {
1022                source: "a".into(),
1023                target: "b".into(),
1024                similarity: 0.8,
1025                clone_type: CloneType::Type2,
1026            },
1027            CloneMatch {
1028                source: "b".into(),
1029                target: "c".into(),
1030                similarity: 0.75,
1031                clone_type: CloneType::Type2,
1032            },
1033        ];
1034        let clusters = cluster_matches(&matches);
1035        assert_eq!(clusters.len(), 1);
1036        assert_eq!(clusters[0].members.len(), 3);
1037        assert_eq!(clusters[0].id, 1);
1038    }
1039
1040    #[test]
1041    fn cluster_separate_components() {
1042        let matches = vec![
1043            CloneMatch {
1044                source: "a".into(),
1045                target: "b".into(),
1046                similarity: 0.8,
1047                clone_type: CloneType::Type1,
1048            },
1049            CloneMatch {
1050                source: "c".into(),
1051                target: "d".into(),
1052                similarity: 0.9,
1053                clone_type: CloneType::Type1,
1054            },
1055        ];
1056        let clusters = cluster_matches(&matches);
1057        assert_eq!(clusters.len(), 2);
1058    }
1059
1060    #[test]
1061    fn cluster_empty_matches() {
1062        assert!(cluster_matches(&[]).is_empty());
1063    }
1064
1065    #[test]
1066    fn cluster_ids_descending_by_size() {
1067        let matches = vec![
1068            CloneMatch {
1069                source: "a".into(),
1070                target: "b".into(),
1071                similarity: 0.8,
1072                clone_type: CloneType::Type2,
1073            },
1074            CloneMatch {
1075                source: "c".into(),
1076                target: "d".into(),
1077                similarity: 0.9,
1078                clone_type: CloneType::Type1,
1079            },
1080            CloneMatch {
1081                source: "c".into(),
1082                target: "e".into(),
1083                similarity: 0.85,
1084                clone_type: CloneType::Type1,
1085            },
1086        ];
1087        let clusters = cluster_matches(&matches);
1088        assert_eq!(clusters[0].members.len(), 3); // c,d,e
1089        assert_eq!(clusters[0].id, 1);
1090        assert_eq!(clusters[1].members.len(), 2); // a,b
1091        assert_eq!(clusters[1].id, 2);
1092    }
1093}