cpd_core/
detect.rs

1// detect.rs
2
3use rayon::prelude::*;
4use rustc_hash::{FxHashMap, FxHashSet};
5use std::path::Path;
6
7use crate::{
8    hash::{base_pow, hash_window, roll, token_hash},
9    models::{CpdClone, DetectionToken, Fragment, Location, SourceFile, TokenKind},
10};
11
12// ---------------------------------------------------------------------------
13// Internal store type — replaces the Store trait + MemoryStore
14// ---------------------------------------------------------------------------
15
16/// Window store: maps a window hash to the last seen occurrence.
17/// Type alias — no trait indirection, no vtable, no dyn dispatch.
18type WindowStore = FxHashMap<u64, Occurrence>;
19
20/// Lightweight reference to a window position within a format-group detection call.
21#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22struct Occurrence {
23    /// Index into the `prepared` array for this `detect_in_group` call.
24    source_id: usize,
25    token_start: usize,
26}
27
28// ---------------------------------------------------------------------------
29// Deduplication key
30// ---------------------------------------------------------------------------
31
32#[derive(Debug, Clone, PartialEq, Eq, Hash)]
33struct CloneDedupKey {
34    a_id: String,
35    a_start_line: u32,
36    b_id: String,
37    b_start_line: u32,
38}
39
40impl CloneDedupKey {
41    fn from_clone(c: &CpdClone) -> Self {
42        // Normalize: smaller (id, line) first so (A,B) and (B,A) map to the same key.
43        let a_key = (&c.fragment_a.source_id, c.fragment_a.start.line);
44        let b_key = (&c.fragment_b.source_id, c.fragment_b.start.line);
45        if a_key <= b_key {
46            Self {
47                a_id: c.fragment_a.source_id.clone(),
48                a_start_line: c.fragment_a.start.line,
49                b_id: c.fragment_b.source_id.clone(),
50                b_start_line: c.fragment_b.start.line,
51            }
52        } else {
53            Self {
54                a_id: c.fragment_b.source_id.clone(),
55                a_start_line: c.fragment_b.start.line,
56                b_id: c.fragment_a.source_id.clone(),
57                b_start_line: c.fragment_a.start.line,
58            }
59        }
60    }
61}
62
63// ---------------------------------------------------------------------------
64// Public API — SourceFile path (for backward compat with tests)
65// ---------------------------------------------------------------------------
66
67/// Detect duplicate code clones across `files` using a rolling-hash sliding window.
68///
69/// Files are grouped by format; each format group is processed independently.
70/// Rayon is used for outer parallelism (one task per format group).
71pub fn detect(files: &[SourceFile], min_tokens: usize) -> Vec<CpdClone> {
72    detect_with_options(files, min_tokens, false, 0)
73}
74
75/// Detect clones with extended options.
76///
77/// - `skip_local`: skip clone pairs where both fragments share the same parent directory.
78/// - `min_lines`: reject clones whose fragment line span is shorter than this.
79///   The line span is `end.line - start.line`; a clone is kept only if this value
80///   is >= `min_lines`. This mirrors jscpd's `LinesLengthCloneValidator`.
81pub fn detect_with_options(
82    files: &[SourceFile],
83    min_tokens: usize,
84    skip_local: bool,
85    min_lines: usize,
86) -> Vec<CpdClone> {
87    if files.is_empty() || min_tokens == 0 {
88        return vec![];
89    }
90
91    // Group files by format. Sort for deterministic order.
92    let mut by_format: FxHashMap<&str, Vec<usize>> = FxHashMap::default();
93    for (idx, file) in files.iter().enumerate() {
94        by_format.entry(file.format.as_str()).or_default().push(idx);
95    }
96    let mut format_groups: Vec<(&str, Vec<usize>)> = by_format.into_iter().collect();
97    format_groups.sort_unstable_by_key(|(fmt, _)| *fmt);
98    for (_, group) in &mut format_groups {
99        group.sort_unstable_by_key(|&idx| files[idx].id.as_str());
100    }
101
102    let all_clones: Vec<Vec<CpdClone>> = format_groups
103        .into_par_iter()
104        .map(|(_format, indices)| {
105            // Build per-group prepared data from SourceFile.tokens.
106            // This is the backward-compat path; orchestrate.rs uses
107            // detect_prepared() directly to avoid re-hashing.
108            let prepared: Vec<PreparedSource> = indices
109                .iter()
110                .map(|&idx| {
111                    let file = &files[idx];
112                    let mut hashes = Vec::with_capacity(file.tokens.len());
113                    let mut spans: Vec<(Location, Location)> =
114                        Vec::with_capacity(file.tokens.len());
115                    for t in &file.tokens {
116                        if t.kind == TokenKind::Ignore {
117                            continue;
118                        }
119                        hashes.push(token_hash(t.kind.discriminant(), &t.value));
120                        spans.push((t.start.clone(), t.end.clone()));
121                    }
122                    PreparedSource {
123                        id: file.id.clone(),
124                        format: file.format.clone(),
125                        hashes,
126                        spans,
127                    }
128                })
129                .collect();
130            detect_in_group(&prepared, min_tokens, skip_local, min_lines)
131        })
132        .collect();
133
134    let mut clones: Vec<CpdClone> = all_clones.into_iter().flatten().collect();
135    dedup_exact_clones(&mut clones);
136    suppress_subclones(&mut clones);
137
138    clones.sort_by(|a, b| {
139        (
140            &a.fragment_a.source_id,
141            a.fragment_a.start.line,
142            &a.fragment_b.source_id,
143            a.fragment_b.start.line,
144        )
145            .cmp(&(
146                &b.fragment_a.source_id,
147                b.fragment_a.start.line,
148                &b.fragment_b.source_id,
149                b.fragment_b.start.line,
150            ))
151    });
152
153    clones
154}
155
156// ---------------------------------------------------------------------------
157// Direct DetectionToken path (called by orchestrate.rs)
158// ---------------------------------------------------------------------------
159
160/// A file ready for detection: pre-hashed, pre-filtered.
161///
162/// Produced either from `SourceFile.tokens` (backward compat) or directly from
163/// `tokenize_to_detection` output (fast path used by orchestrate.rs).
164pub struct PreparedSource {
165    pub id: String,
166    pub format: String,
167    pub hashes: Vec<u64>,
168    pub spans: Vec<(Location, Location)>,
169}
170
171impl PreparedSource {
172    /// Build from a `DetectionToken` slice — the fast path.
173    pub fn from_detection_tokens(id: String, format: String, tokens: &[DetectionToken]) -> Self {
174        let mut hashes = Vec::with_capacity(tokens.len());
175        let mut spans = Vec::with_capacity(tokens.len());
176        for t in tokens {
177            hashes.push(t.hash);
178            spans.push((t.start.clone(), t.end.clone()));
179        }
180        Self {
181            id,
182            format,
183            hashes,
184            spans,
185        }
186    }
187}
188
189/// Detect clones from pre-prepared sources grouped by format.
190///
191/// Called by orchestrate.rs after `tokenize_to_detection` — skips re-hashing.
192pub fn detect_prepared(
193    format_groups: Vec<Vec<PreparedSource>>,
194    min_tokens: usize,
195    skip_local: bool,
196    min_lines: usize,
197) -> Vec<CpdClone> {
198    if format_groups.is_empty() || min_tokens == 0 {
199        return vec![];
200    }
201
202    let all_clones: Vec<Vec<CpdClone>> = format_groups
203        .into_par_iter()
204        .map(|group| detect_in_group(&group, min_tokens, skip_local, min_lines))
205        .collect();
206
207    let mut clones: Vec<CpdClone> = all_clones.into_iter().flatten().collect();
208    dedup_exact_clones(&mut clones);
209    suppress_subclones(&mut clones);
210
211    clones.sort_by(|a, b| {
212        (
213            &a.fragment_a.source_id,
214            a.fragment_a.start.line,
215            &a.fragment_b.source_id,
216            a.fragment_b.start.line,
217        )
218            .cmp(&(
219                &b.fragment_a.source_id,
220                b.fragment_a.start.line,
221                &b.fragment_b.source_id,
222                b.fragment_b.start.line,
223            ))
224    });
225
226    clones
227}
228
229// ---------------------------------------------------------------------------
230// Core detection — per format group
231// ---------------------------------------------------------------------------
232
233fn detect_in_group(
234    prepared: &[PreparedSource],
235    min_tokens: usize,
236    skip_local: bool,
237    min_lines: usize,
238) -> Vec<CpdClone> {
239    // Precompute window_power once for this format group.
240    // If per-language min_tokens is introduced, recompute per group (it is already scoped here).
241    let window_power = base_pow(min_tokens.saturating_sub(1));
242
243    // Pre-allocate store capacity to avoid FxHashMap rehashing.
244    let total_windows: usize = prepared
245        .iter()
246        .map(|p| p.hashes.len().saturating_sub(min_tokens))
247        .sum();
248    let mut store: WindowStore =
249        FxHashMap::with_capacity_and_hasher(total_windows, Default::default());
250
251    let mut clones: Vec<CpdClone> = Vec::new();
252    // Cap at 2. Values > 2 cause unbounded memory growth on boilerplate-heavy codebases.
253    const SECONDARY_OCCURRENCE_CAP: usize = 2;
254    let mut repeated_windows: FxHashMap<u64, Vec<Occurrence>> = FxHashMap::default();
255
256    for (file_idx, source) in prepared.iter().enumerate() {
257        let hashes = &source.hashes;
258        if hashes.len() < min_tokens {
259            continue;
260        }
261        let windows_len = hashes.len() - min_tokens + 1;
262
263        // open_clone state machine: replaces emit-every-window + suppress_subclones.
264        // A clone is opened when a matching window is found and enlarged as long as
265        // subsequent windows also match. flush_clone is called when the match breaks
266        // or the file scan ends — only one clone per contiguous matching region.
267        let mut open_clone: Option<OpenClone> = None;
268
269        let mut window_hash = hash_window(&hashes[..min_tokens]);
270
271        for token_start in 0..windows_len {
272            if token_start > 0 {
273                window_hash = roll(
274                    window_hash,
275                    hashes[token_start - 1],
276                    hashes[token_start + min_tokens - 1],
277                    window_power,
278                );
279            }
280
281            let current = Occurrence {
282                source_id: file_idx,
283                token_start,
284            };
285
286            match store.get(&window_hash).copied() {
287                Some(stored) if windows_match(stored, current, prepared) => {
288                    if open_clone.is_none() {
289                        open_clone = Some(OpenClone {
290                            stored_occurrence: stored,
291                            current_start: token_start,
292                            match_len: min_tokens,
293                        });
294                    } else if let Some(ref mut oc) = open_clone {
295                        // Enlarge: the next window also matches — extend by one token.
296                        oc.match_len += 1;
297                    }
298                    remember_repeated_window(
299                        &mut repeated_windows,
300                        window_hash,
301                        stored,
302                        SECONDARY_OCCURRENCE_CAP,
303                    );
304                    remember_repeated_window(
305                        &mut repeated_windows,
306                        window_hash,
307                        current,
308                        SECONDARY_OCCURRENCE_CAP,
309                    );
310                    // Do NOT update store — keep the first occurrence so the enlargement
311                    // stays consistent across the contiguous match region.
312                }
313                _ => {
314                    // Match broke (or no entry). Flush whatever was open.
315                    flush_clone(
316                        open_clone.take(),
317                        file_idx,
318                        prepared,
319                        skip_local,
320                        min_lines,
321                        &mut clones,
322                    );
323                    store.insert(window_hash, current);
324                }
325            }
326        }
327
328        // Flush any open clone at the end of the file scan.
329        flush_clone(
330            open_clone.take(),
331            file_idx,
332            prepared,
333            skip_local,
334            min_lines,
335            &mut clones,
336        );
337    }
338
339    add_secondary_clones(
340        repeated_windows,
341        prepared,
342        min_tokens,
343        skip_local,
344        min_lines,
345        &mut clones,
346    );
347
348    clones
349}
350
351// ---------------------------------------------------------------------------
352// Open clone state machine helpers
353// ---------------------------------------------------------------------------
354
355struct OpenClone {
356    stored_occurrence: Occurrence,
357    current_start: usize,
358    match_len: usize,
359}
360
361/// Returns true if the window at `current` actually matches the window at `stored`
362/// (hash match is necessary but not sufficient — verify token equality).
363fn windows_match(stored: Occurrence, current: Occurrence, _prepared: &[PreparedSource]) -> bool {
364    if stored.source_id == current.source_id && stored.token_start == current.token_start {
365        return false; // same position — not a duplicate
366    }
367    // Hash match is sufficient for detection (Rabin-Karp assumption).
368    // We trust the hash; no secondary token-by-token verification here.
369    // The chance of false positives with xxh3_64 is negligible.
370    true
371}
372
373/// Flush an open clone to the clones list.
374///
375/// A clone is rejected if its line span is shorter than `min_lines`.
376/// The line span is measured as `end.line - start.line` (which equals
377/// `number_of_lines - 1`). Mirrors jscpd's `LinesLengthCloneValidator`.
378fn flush_clone(
379    open: Option<OpenClone>,
380    current_file_idx: usize,
381    prepared: &[PreparedSource],
382    skip_local: bool,
383    min_lines: usize,
384    clones: &mut Vec<CpdClone>,
385) {
386    let oc = match open {
387        Some(o) => o,
388        None => return,
389    };
390
391    let existing = &oc.stored_occurrence;
392    let cur_start = oc.current_start;
393    let match_len = oc.match_len;
394
395    let existing_file = &prepared[existing.source_id];
396    let current_file = &prepared[current_file_idx];
397
398    let ex_start = existing.token_start;
399    let ex_end = ex_start + match_len - 1;
400    let cur_end = cur_start + match_len - 1;
401
402    // Guard: don't emit a self-clone for windows that reference overlapping token
403    // ranges in the same file. Adjacent ranges (sharing exactly one boundary token)
404    // are allowed — they represent consecutive duplicate blocks.
405    //
406    // Overlap condition: the two regions [ex_start, ex_end] and [cur_start, cur_end]
407    // overlap if ex_start < cur_end AND cur_start < ex_end (strict less-than).
408    // Using strict less-than excludes the adjacent/boundary case.
409    if existing.source_id == current_file_idx {
410        let overlap = if ex_start < cur_start {
411            // ex region starts first; overlaps if ex_end > cur_start
412            ex_end > cur_start
413        } else {
414            // cur region starts first (or equal); overlaps if cur_end > ex_start
415            cur_end > ex_start
416        };
417        if overlap {
418            return;
419        }
420    }
421
422    // skip_local: drop clone pairs where both fragments share the same parent directory.
423    if skip_local {
424        let dir_a = Path::new(&existing_file.id).parent();
425        let dir_b = Path::new(&current_file.id).parent();
426        if dir_a == dir_b {
427            return;
428        }
429    }
430
431    let fragment_a = match make_fragment(&existing_file.id, &existing_file.spans, ex_start, ex_end)
432    {
433        Some(f) => f,
434        None => return,
435    };
436    let fragment_b = match make_fragment(&current_file.id, &current_file.spans, cur_start, cur_end)
437    {
438        Some(f) => f,
439        None => return,
440    };
441
442    // min_lines filter: reject clones whose line span is shorter than min_lines.
443    // Mirrors jscpd's LinesLengthCloneValidator which checks
444    //   duplicationA.end.line - duplicationA.start.line >= minLines
445    if min_lines > 0 {
446        let span_a = fragment_a.end.line as i64 - fragment_a.start.line as i64;
447        let span_b = fragment_b.end.line as i64 - fragment_b.start.line as i64;
448        if span_a < min_lines as i64 && span_b < min_lines as i64 {
449            return;
450        }
451    }
452
453    clones.push(CpdClone {
454        format: current_file.format.clone(),
455        fragment_a,
456        fragment_b,
457        token_count: match_len as u32,
458    });
459}
460
461fn make_fragment(
462    source_id: &str,
463    spans: &[(Location, Location)],
464    start_idx: usize,
465    end_idx: usize,
466) -> Option<Fragment> {
467    let (first_start, _) = spans.get(start_idx)?;
468    let (_, last_end) = spans.get(end_idx)?;
469    Some(Fragment {
470        source_id: source_id.to_string(),
471        start: first_start.clone(),
472        end: last_end.clone(),
473        range: [start_idx as u32, end_idx as u32],
474        blame: None,
475    })
476}
477
478// ---------------------------------------------------------------------------
479// Deduplication — O(n) FxHashSet + sub-clone suppression
480// ---------------------------------------------------------------------------
481
482fn dedup_exact_clones(clones: &mut Vec<CpdClone>) {
483    // Normalize each clone so fragment_a <= fragment_b (by id then start line).
484    for clone in clones.iter_mut() {
485        let a_key = (&clone.fragment_a.source_id, clone.fragment_a.start.line);
486        let b_key = (&clone.fragment_b.source_id, clone.fragment_b.start.line);
487        if a_key > b_key {
488            std::mem::swap(&mut clone.fragment_a, &mut clone.fragment_b);
489        }
490    }
491
492    let mut seen: FxHashSet<CloneDedupKey> = FxHashSet::default();
493    clones.retain(|c| seen.insert(CloneDedupKey::from_clone(c)));
494}
495
496/// Remove clones that are fully contained (by token range) within a larger clone
497/// of the same file pair. Largest clones first so containers are processed before
498/// their contained sub-clones.
499///
500/// This handles sub-clone outputs from the secondary pass and any edge cases
501/// where the primary emits adjacent-region duplicates.
502fn suppress_subclones(clones: &mut Vec<CpdClone>) {
503    use std::cmp::Reverse;
504    clones.sort_by_key(|c| Reverse(c.token_count));
505
506    let n = clones.len();
507    let mut keep = vec![true; n];
508
509    for i in 0..n {
510        if !keep[i] {
511            continue;
512        }
513        let big_a_id = clones[i].fragment_a.source_id.clone();
514        let big_b_id = clones[i].fragment_b.source_id.clone();
515        let big_a_range = clones[i].fragment_a.range;
516        let big_b_range = clones[i].fragment_b.range;
517
518        for j in (i + 1)..n {
519            if !keep[j] {
520                continue;
521            }
522            let small = &clones[j];
523            if small.fragment_a.source_id != big_a_id || small.fragment_b.source_id != big_b_id {
524                continue;
525            }
526            // Sub-clone: small's both fragments are contained within big's ranges.
527            if big_a_range[0] <= small.fragment_a.range[0]
528                && big_a_range[1] >= small.fragment_a.range[1]
529                && big_b_range[0] <= small.fragment_b.range[0]
530                && big_b_range[1] >= small.fragment_b.range[1]
531            {
532                keep[j] = false;
533            }
534        }
535    }
536
537    let mut i = 0;
538    clones.retain(|_| {
539        let k = keep[i];
540        i += 1;
541        k
542    });
543}
544
545// ---------------------------------------------------------------------------
546// Secondary clone pass
547// ---------------------------------------------------------------------------
548
549fn remember_repeated_window(
550    repeated_windows: &mut FxHashMap<u64, Vec<Occurrence>>,
551    hash: u64,
552    occurrence: Occurrence,
553    cap: usize,
554) {
555    let bucket = repeated_windows.entry(hash).or_default();
556    if bucket
557        .iter()
558        .any(|s| s.source_id == occurrence.source_id && s.token_start == occurrence.token_start)
559    {
560        return;
561    }
562    if bucket.len() < cap {
563        bucket.push(occurrence);
564    }
565}
566
567fn add_secondary_clones(
568    repeated_windows: FxHashMap<u64, Vec<Occurrence>>,
569    prepared: &[PreparedSource],
570    min_tokens: usize,
571    skip_local: bool,
572    min_lines: usize,
573    clones: &mut Vec<CpdClone>,
574) {
575    if repeated_windows.is_empty() {
576        return;
577    }
578
579    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
580    struct Candidate {
581        source_a: usize,
582        source_b: usize,
583        token_a: usize,
584        token_b: usize,
585    }
586
587    let mut candidates: Vec<Candidate> = Vec::new();
588    for occurrences in repeated_windows.values() {
589        if occurrences.len() < 2 {
590            continue;
591        }
592        for li in 0..occurrences.len() {
593            for ri in li + 1..occurrences.len() {
594                let left = &occurrences[li];
595                let right = &occurrences[ri];
596                if left.source_id == right.source_id && left.token_start == right.token_start {
597                    continue;
598                }
599                let lh = &prepared[left.source_id].hashes;
600                let rh = &prepared[right.source_id].hashes;
601                let la = left.token_start;
602                let ra = right.token_start;
603                if la + min_tokens > lh.len() || ra + min_tokens > rh.len() {
604                    continue;
605                }
606                if lh[la..la + min_tokens] != rh[ra..ra + min_tokens] {
607                    continue;
608                }
609                let (sa, ta, sb, tb) =
610                    if (left.source_id, left.token_start) <= (right.source_id, right.token_start) {
611                        (
612                            left.source_id,
613                            left.token_start,
614                            right.source_id,
615                            right.token_start,
616                        )
617                    } else {
618                        (
619                            right.source_id,
620                            right.token_start,
621                            left.source_id,
622                            left.token_start,
623                        )
624                    };
625                candidates.push(Candidate {
626                    source_a: sa,
627                    source_b: sb,
628                    token_a: ta,
629                    token_b: tb,
630                });
631            }
632        }
633    }
634    if candidates.is_empty() {
635        return;
636    }
637    candidates.sort_unstable();
638    candidates.dedup();
639
640    // Build line-coverage from already-found primary clones.
641    let n_files = prepared.len();
642    let mut covered: Vec<Vec<(u32, u32)>> = vec![Vec::new(); n_files];
643    for c in clones.iter() {
644        let fa = prepared.iter().position(|p| p.id == c.fragment_a.source_id);
645        let fb = prepared.iter().position(|p| p.id == c.fragment_b.source_id);
646        if let Some(idx) = fa {
647            covered[idx].push((c.fragment_a.start.line, c.fragment_a.end.line));
648        }
649        if let Some(idx) = fb {
650            covered[idx].push((c.fragment_b.start.line, c.fragment_b.end.line));
651        }
652    }
653    for ranges in &mut covered {
654        ranges.sort_unstable();
655    }
656
657    let line_extends_coverage = |file_idx: usize, start: u32, end: u32| -> bool {
658        let ranges = &covered[file_idx];
659        let mut next = start;
660        for &(rs, re) in ranges {
661            if re < next {
662                continue;
663            }
664            if rs > next {
665                return true;
666            }
667            next = next.max(re.saturating_add(1));
668            if next > end {
669                return false;
670            }
671        }
672        next <= end
673    };
674
675    for cand in candidates {
676        // Build the clone using flush_clone's logic (direct fragment construction).
677        let existing = Occurrence {
678            source_id: cand.source_a,
679            token_start: cand.token_a,
680        };
681        let current = Occurrence {
682            source_id: cand.source_b,
683            token_start: cand.token_b,
684        };
685
686        // skip_local check
687        if skip_local {
688            let dir_a = Path::new(&prepared[existing.source_id].id).parent();
689            let dir_b = Path::new(&prepared[current.source_id].id).parent();
690            if dir_a == dir_b {
691                continue;
692            }
693        }
694
695        // Greedy extend.
696        let ex_hashes = &prepared[existing.source_id].hashes;
697        let cur_hashes = &prepared[current.source_id].hashes;
698        let max_extend = (ex_hashes
699            .len()
700            .saturating_sub(existing.token_start + min_tokens))
701        .min(
702            cur_hashes
703                .len()
704                .saturating_sub(current.token_start + min_tokens),
705        );
706        let mut extra = 0usize;
707        while extra < max_extend
708            && ex_hashes[existing.token_start + min_tokens + extra]
709                == cur_hashes[current.token_start + min_tokens + extra]
710        {
711            extra += 1;
712        }
713        let match_len = min_tokens + extra;
714
715        let ex_start = existing.token_start;
716        let ex_end = ex_start + match_len - 1;
717        let cur_start = current.token_start;
718        let cur_end = cur_start + match_len - 1;
719
720        let frag_a = match make_fragment(
721            &prepared[existing.source_id].id,
722            &prepared[existing.source_id].spans,
723            ex_start,
724            ex_end,
725        ) {
726            Some(f) => f,
727            None => continue,
728        };
729        let frag_b = match make_fragment(
730            &prepared[current.source_id].id,
731            &prepared[current.source_id].spans,
732            cur_start,
733            cur_end,
734        ) {
735            Some(f) => f,
736            None => continue,
737        };
738
739        let nc = CpdClone {
740            format: prepared[current.source_id].format.clone(),
741            fragment_a: frag_a,
742            fragment_b: frag_b,
743            token_count: match_len as u32,
744        };
745
746        // min_lines filter for secondary clones too.
747        if min_lines > 0 {
748            let span_a = nc.fragment_a.end.line as i64 - nc.fragment_a.start.line as i64;
749            let span_b = nc.fragment_b.end.line as i64 - nc.fragment_b.start.line as i64;
750            if span_a < min_lines as i64 && span_b < min_lines as i64 {
751                continue;
752            }
753        }
754
755        let extends_a = line_extends_coverage(
756            cand.source_a,
757            nc.fragment_a.start.line,
758            nc.fragment_a.end.line,
759        );
760        let extends_b = line_extends_coverage(
761            cand.source_b,
762            nc.fragment_b.start.line,
763            nc.fragment_b.end.line,
764        );
765        if extends_a || extends_b {
766            clones.push(nc);
767        }
768    }
769}
770
771// ---------------------------------------------------------------------------
772// Tests
773// ---------------------------------------------------------------------------
774
775#[cfg(test)]
776mod tests {
777    use super::*;
778    use crate::models::{Location, Token, TokenKind};
779
780    fn loc(line: u32, col: u32, offset: u32) -> Location {
781        Location {
782            line,
783            column: col,
784            offset,
785        }
786    }
787
788    fn make_token(kind: TokenKind, value: &str, line: u32, col: u32, offset: u32) -> Token {
789        let end_col = col + value.len() as u32;
790        let end_off = offset + value.len() as u32;
791        Token {
792            kind,
793            value: value.to_string(),
794            start: loc(line, col, offset),
795            end: loc(line, end_col, end_off),
796        }
797    }
798
799    fn make_file(id: &str, format: &str, tokens: Vec<Token>) -> SourceFile {
800        SourceFile {
801            id: id.to_string(),
802            format: format.to_string(),
803            tokens,
804        }
805    }
806
807    fn js_tokens_ab() -> Vec<Token> {
808        vec![
809            make_token(TokenKind::Keyword, "function", 1, 0, 0),
810            make_token(TokenKind::Other, "hello", 1, 9, 9),
811            make_token(TokenKind::Operator, "(", 1, 14, 14),
812            make_token(TokenKind::Operator, ")", 1, 15, 15),
813            make_token(TokenKind::Operator, "{", 1, 16, 16),
814            make_token(TokenKind::Keyword, "return", 2, 0, 18),
815            make_token(TokenKind::Literal, "42", 2, 7, 25),
816            make_token(TokenKind::Operator, ";", 2, 9, 27),
817            make_token(TokenKind::Operator, "}", 3, 0, 29),
818        ]
819    }
820
821    #[test]
822    fn empty_input_returns_empty() {
823        let result = detect(&[], 10);
824        assert!(result.is_empty());
825    }
826
827    #[test]
828    fn identical_files_detected_as_clone() {
829        let tokens = js_tokens_ab();
830        let file_a = make_file("a.js", "javascript", tokens.clone());
831        let file_b = make_file("b.js", "javascript", tokens);
832        let clones = detect(&[file_a, file_b], 5);
833        assert!(
834            !clones.is_empty(),
835            "identical files must produce at least one clone"
836        );
837    }
838
839    #[test]
840    fn min_tokens_threshold_respected() {
841        let tokens = js_tokens_ab(); // 9 tokens
842        let file_a = make_file("a.js", "javascript", tokens.clone());
843        let file_b = make_file("b.js", "javascript", tokens);
844        let clones = detect(&[file_a, file_b], 100);
845        assert!(
846            clones.is_empty(),
847            "no clones when min_tokens exceeds file length"
848        );
849    }
850
851    #[test]
852    fn deduplication_ab_ba_collapse() {
853        let tokens = js_tokens_ab();
854        let file_a = make_file("a.js", "javascript", tokens.clone());
855        let file_b = make_file("b.js", "javascript", tokens);
856        let clones = detect(&[file_a, file_b], 5);
857        // Should get exactly 1 clone pair, not 2 symmetric ones.
858        assert_eq!(clones.len(), 1, "symmetric pairs must collapse to 1");
859    }
860
861    #[test]
862    fn different_formats_not_cross_detected() {
863        let tokens = js_tokens_ab();
864        let file_js = make_file("a.js", "javascript", tokens.clone());
865        let file_py = make_file("a.py", "python", tokens);
866        let clones = detect(&[file_js, file_py], 5);
867        assert!(
868            clones.is_empty(),
869            "tokens from different formats must not match"
870        );
871    }
872
873    #[test]
874    fn identical_files_maximal_clone() {
875        // With the open_clone state machine, a single maximal clone is emitted
876        // instead of multiple sliding-window sub-clones.
877        let tokens = js_tokens_ab();
878        let file_a = make_file("a.js", "javascript", tokens.clone());
879        let file_b = make_file("b.js", "javascript", tokens);
880        let clones = detect(&[file_a, file_b], 5);
881        assert_eq!(
882            clones.len(),
883            1,
884            "open_clone SM must produce one maximal clone"
885        );
886        assert_eq!(
887            clones[0].token_count, 9,
888            "maximal clone must cover all 9 tokens"
889        );
890    }
891
892    #[test]
893    fn three_identical_files_secondary_pass_adds_missing_pair() {
894        let tokens = js_tokens_ab();
895        let file_a = make_file("a.js", "javascript", tokens.clone());
896        let file_b = make_file("b.js", "javascript", tokens.clone());
897        let file_c = make_file("c.js", "javascript", tokens);
898        let clones = detect(&[file_a, file_b, file_c], 5);
899        assert!(
900            clones.len() >= 2,
901            "three identical files must yield at least 2 clone pairs, got {}",
902            clones.len()
903        );
904    }
905
906    #[test]
907    fn clones_sorted_by_source_and_line() {
908        let tokens = js_tokens_ab();
909        let file_a = make_file("a.js", "javascript", tokens.clone());
910        let file_b = make_file("b.js", "javascript", tokens);
911        let clones = detect(&[file_a, file_b], 5);
912        for i in 1..clones.len() {
913            let prev = &clones[i - 1];
914            let curr = &clones[i];
915            assert!(
916                (
917                    &prev.fragment_a.source_id,
918                    prev.fragment_a.start.line,
919                    &prev.fragment_b.source_id,
920                    prev.fragment_b.start.line,
921                ) <= (
922                    &curr.fragment_a.source_id,
923                    curr.fragment_a.start.line,
924                    &curr.fragment_b.source_id,
925                    curr.fragment_b.start.line,
926                ),
927                "clones must be sorted"
928            );
929        }
930    }
931}
cpd_core/detect.rs

cpd_core/
detect.rs