cpd_core/
detect.rs

1// detect.rs
2// Attribution: sliding-window Rabin-Karp clone detection; inspired by jscpd-rs approach; rewritten independently.
3
4use rayon::prelude::*;
5use rustc_hash::{FxHashMap, FxHashSet};
6use std::path::Path;
7
8use crate::{
9    hash::{base_pow, hash_window, roll, token_hash},
10    models::{CpdClone, DetectionToken, Fragment, Location, SourceFile, TokenKind},
11};
12
13// ---------------------------------------------------------------------------
14// Internal store type — replaces the Store trait + MemoryStore
15// ---------------------------------------------------------------------------
16
17/// Window store: maps a window hash to the last seen occurrence.
18/// Type alias — no trait indirection, no vtable, no dyn dispatch.
19type WindowStore = FxHashMap<u64, Occurrence>;
20
21/// Lightweight reference to a window position within a format-group detection call.
22#[derive(Debug, Clone, Copy, PartialEq, Eq)]
23struct Occurrence {
24    /// Index into the `prepared` array for this `detect_in_group` call.
25    source_id: usize,
26    token_start: usize,
27}
28
29// ---------------------------------------------------------------------------
30// Deduplication key
31// ---------------------------------------------------------------------------
32
33#[derive(Debug, Clone, PartialEq, Eq, Hash)]
34struct CloneDedupKey {
35    a_id: String,
36    a_start_line: u32,
37    b_id: String,
38    b_start_line: u32,
39}
40
41impl CloneDedupKey {
42    fn from_clone(c: &CpdClone) -> Self {
43        // Normalize: smaller (id, line) first so (A,B) and (B,A) map to the same key.
44        let a_key = (&c.fragment_a.source_id, c.fragment_a.start.line);
45        let b_key = (&c.fragment_b.source_id, c.fragment_b.start.line);
46        if a_key <= b_key {
47            Self {
48                a_id: c.fragment_a.source_id.clone(),
49                a_start_line: c.fragment_a.start.line,
50                b_id: c.fragment_b.source_id.clone(),
51                b_start_line: c.fragment_b.start.line,
52            }
53        } else {
54            Self {
55                a_id: c.fragment_b.source_id.clone(),
56                a_start_line: c.fragment_b.start.line,
57                b_id: c.fragment_a.source_id.clone(),
58                b_start_line: c.fragment_a.start.line,
59            }
60        }
61    }
62}
63
64// ---------------------------------------------------------------------------
65// Public API — SourceFile path (for backward compat with tests)
66// ---------------------------------------------------------------------------
67
68/// Detect duplicate code clones across `files` using a rolling-hash sliding window.
69///
70/// Files are grouped by format; each format group is processed independently.
71/// Rayon is used for outer parallelism (one task per format group).
72pub fn detect(files: &[SourceFile], min_tokens: usize) -> Vec<CpdClone> {
73    detect_with_options(files, min_tokens, false, 0)
74}
75
76/// Detect clones with extended options.
77///
78/// - `skip_local`: skip clone pairs where both fragments share the same parent directory.
79/// - `min_lines`: reject clones whose fragment line span is shorter than this.
80///   The line span is `end.line - start.line`; a clone is kept only if this value
81///   is >= `min_lines`. This mirrors jscpd's `LinesLengthCloneValidator`.
82pub fn detect_with_options(
83    files: &[SourceFile],
84    min_tokens: usize,
85    skip_local: bool,
86    min_lines: usize,
87) -> Vec<CpdClone> {
88    if files.is_empty() || min_tokens == 0 {
89        return vec![];
90    }
91
92    // Group files by format. Sort for deterministic order.
93    let mut by_format: FxHashMap<&str, Vec<usize>> = FxHashMap::default();
94    for (idx, file) in files.iter().enumerate() {
95        by_format.entry(file.format.as_str()).or_default().push(idx);
96    }
97    let mut format_groups: Vec<(&str, Vec<usize>)> = by_format.into_iter().collect();
98    format_groups.sort_unstable_by_key(|(fmt, _)| *fmt);
99    for (_, group) in &mut format_groups {
100        group.sort_unstable_by_key(|&idx| files[idx].id.as_str());
101    }
102
103    let all_clones: Vec<Vec<CpdClone>> = format_groups
104        .into_par_iter()
105        .map(|(_format, indices)| {
106            // Build per-group prepared data from SourceFile.tokens.
107            // This is the backward-compat path; orchestrate.rs uses
108            // detect_prepared() directly to avoid re-hashing.
109            let prepared: Vec<PreparedSource> = indices
110                .iter()
111                .map(|&idx| {
112                    let file = &files[idx];
113                    let mut hashes = Vec::with_capacity(file.tokens.len());
114                    let mut spans: Vec<(Location, Location)> =
115                        Vec::with_capacity(file.tokens.len());
116                    for t in &file.tokens {
117                        if t.kind == TokenKind::Ignore {
118                            continue;
119                        }
120                        hashes.push(token_hash(t.kind.discriminant(), &t.value));
121                        spans.push((t.start.clone(), t.end.clone()));
122                    }
123                    PreparedSource {
124                        id: file.id.clone(),
125                        format: file.format.clone(),
126                        hashes,
127                        spans,
128                    }
129                })
130                .collect();
131            detect_in_group(&prepared, min_tokens, skip_local, min_lines)
132        })
133        .collect();
134
135    let mut clones: Vec<CpdClone> = all_clones.into_iter().flatten().collect();
136    dedup_exact_clones(&mut clones);
137    suppress_subclones(&mut clones);
138
139    clones.sort_by(|a, b| {
140        (
141            &a.fragment_a.source_id,
142            a.fragment_a.start.line,
143            &a.fragment_b.source_id,
144            a.fragment_b.start.line,
145        )
146            .cmp(&(
147                &b.fragment_a.source_id,
148                b.fragment_a.start.line,
149                &b.fragment_b.source_id,
150                b.fragment_b.start.line,
151            ))
152    });
153
154    clones
155}
156
157// ---------------------------------------------------------------------------
158// Direct DetectionToken path (called by orchestrate.rs)
159// ---------------------------------------------------------------------------
160
161/// A file ready for detection: pre-hashed, pre-filtered.
162///
163/// Produced either from `SourceFile.tokens` (backward compat) or directly from
164/// `tokenize_to_detection` output (fast path used by orchestrate.rs).
165pub struct PreparedSource {
166    pub id: String,
167    pub format: String,
168    pub hashes: Vec<u64>,
169    pub spans: Vec<(Location, Location)>,
170}
171
172impl PreparedSource {
173    /// Build from a `DetectionToken` slice — the fast path.
174    pub fn from_detection_tokens(id: String, format: String, tokens: &[DetectionToken]) -> Self {
175        let mut hashes = Vec::with_capacity(tokens.len());
176        let mut spans = Vec::with_capacity(tokens.len());
177        for t in tokens {
178            hashes.push(t.hash);
179            spans.push((t.start.clone(), t.end.clone()));
180        }
181        Self {
182            id,
183            format,
184            hashes,
185            spans,
186        }
187    }
188}
189
190/// Detect clones from pre-prepared sources grouped by format.
191///
192/// Called by orchestrate.rs after `tokenize_to_detection` — skips re-hashing.
193pub fn detect_prepared(
194    format_groups: Vec<Vec<PreparedSource>>,
195    min_tokens: usize,
196    skip_local: bool,
197    min_lines: usize,
198) -> Vec<CpdClone> {
199    if format_groups.is_empty() || min_tokens == 0 {
200        return vec![];
201    }
202
203    let all_clones: Vec<Vec<CpdClone>> = format_groups
204        .into_par_iter()
205        .map(|group| detect_in_group(&group, min_tokens, skip_local, min_lines))
206        .collect();
207
208    let mut clones: Vec<CpdClone> = all_clones.into_iter().flatten().collect();
209    dedup_exact_clones(&mut clones);
210    suppress_subclones(&mut clones);
211
212    clones.sort_by(|a, b| {
213        (
214            &a.fragment_a.source_id,
215            a.fragment_a.start.line,
216            &a.fragment_b.source_id,
217            a.fragment_b.start.line,
218        )
219            .cmp(&(
220                &b.fragment_a.source_id,
221                b.fragment_a.start.line,
222                &b.fragment_b.source_id,
223                b.fragment_b.start.line,
224            ))
225    });
226
227    clones
228}
229
230// ---------------------------------------------------------------------------
231// Core detection — per format group
232// ---------------------------------------------------------------------------
233
234fn detect_in_group(
235    prepared: &[PreparedSource],
236    min_tokens: usize,
237    skip_local: bool,
238    min_lines: usize,
239) -> Vec<CpdClone> {
240    // Precompute window_power once for this format group.
241    // If per-language min_tokens is introduced, recompute per group (it is already scoped here).
242    let window_power = base_pow(min_tokens.saturating_sub(1));
243
244    // Pre-allocate store capacity to avoid FxHashMap rehashing.
245    let total_windows: usize = prepared
246        .iter()
247        .map(|p| p.hashes.len().saturating_sub(min_tokens))
248        .sum();
249    let mut store: WindowStore =
250        FxHashMap::with_capacity_and_hasher(total_windows, Default::default());
251
252    let mut clones: Vec<CpdClone> = Vec::new();
253    // Cap at 2 — matching jscpd-rs SECONDARY_OCCURRENCE_CAP.
254    // Values > 2 cause unbounded memory growth on boilerplate-heavy codebases.
255    const SECONDARY_OCCURRENCE_CAP: usize = 2;
256    let mut repeated_windows: FxHashMap<u64, Vec<Occurrence>> = FxHashMap::default();
257
258    for (file_idx, source) in prepared.iter().enumerate() {
259        let hashes = &source.hashes;
260        if hashes.len() < min_tokens {
261            continue;
262        }
263        let windows_len = hashes.len() - min_tokens + 1;
264
265        // open_clone state machine: replaces emit-every-window + suppress_subclones.
266        // A clone is opened when a matching window is found and enlarged as long as
267        // subsequent windows also match. flush_clone is called when the match breaks
268        // or the file scan ends — only one clone per contiguous matching region.
269        let mut open_clone: Option<OpenClone> = None;
270
271        let mut window_hash = hash_window(&hashes[..min_tokens]);
272
273        for token_start in 0..windows_len {
274            if token_start > 0 {
275                window_hash = roll(
276                    window_hash,
277                    hashes[token_start - 1],
278                    hashes[token_start + min_tokens - 1],
279                    window_power,
280                );
281            }
282
283            let current = Occurrence {
284                source_id: file_idx,
285                token_start,
286            };
287
288            match store.get(&window_hash).copied() {
289                Some(stored) if windows_match(stored, current, prepared) => {
290                    if open_clone.is_none() {
291                        open_clone = Some(OpenClone {
292                            stored_occurrence: stored,
293                            current_start: token_start,
294                            match_len: min_tokens,
295                        });
296                    } else if let Some(ref mut oc) = open_clone {
297                        // Enlarge: the next window also matches — extend by one token.
298                        oc.match_len += 1;
299                    }
300                    remember_repeated_window(
301                        &mut repeated_windows,
302                        window_hash,
303                        stored,
304                        SECONDARY_OCCURRENCE_CAP,
305                    );
306                    remember_repeated_window(
307                        &mut repeated_windows,
308                        window_hash,
309                        current,
310                        SECONDARY_OCCURRENCE_CAP,
311                    );
312                    // Do NOT update store — keep the first occurrence so the enlargement
313                    // stays consistent across the contiguous match region.
314                }
315                _ => {
316                    // Match broke (or no entry). Flush whatever was open.
317                    flush_clone(
318                        open_clone.take(),
319                        file_idx,
320                        prepared,
321                        skip_local,
322                        min_lines,
323                        &mut clones,
324                    );
325                    store.insert(window_hash, current);
326                }
327            }
328        }
329
330        // Flush any open clone at the end of the file scan.
331        flush_clone(
332            open_clone.take(),
333            file_idx,
334            prepared,
335            skip_local,
336            min_lines,
337            &mut clones,
338        );
339    }
340
341    add_secondary_clones(
342        repeated_windows,
343        prepared,
344        min_tokens,
345        skip_local,
346        min_lines,
347        &mut clones,
348    );
349
350    clones
351}
352
353// ---------------------------------------------------------------------------
354// Open clone state machine helpers
355// ---------------------------------------------------------------------------
356
357struct OpenClone {
358    stored_occurrence: Occurrence,
359    current_start: usize,
360    match_len: usize,
361}
362
363/// Returns true if the window at `current` actually matches the window at `stored`
364/// (hash match is necessary but not sufficient — verify token equality).
365fn windows_match(stored: Occurrence, current: Occurrence, _prepared: &[PreparedSource]) -> bool {
366    if stored.source_id == current.source_id && stored.token_start == current.token_start {
367        return false; // same position — not a duplicate
368    }
369    // Hash match is sufficient for detection (Rabin-Karp assumption).
370    // We trust the hash; no secondary token-by-token verification here.
371    // The chance of false positives with xxh3_64 is negligible.
372    true
373}
374
375/// Flush an open clone to the clones list.
376///
377/// A clone is rejected if its line span is shorter than `min_lines`.
378/// The line span is measured as `end.line - start.line` (which equals
379/// `number_of_lines - 1`). Mirrors jscpd's `LinesLengthCloneValidator`.
380fn flush_clone(
381    open: Option<OpenClone>,
382    current_file_idx: usize,
383    prepared: &[PreparedSource],
384    skip_local: bool,
385    min_lines: usize,
386    clones: &mut Vec<CpdClone>,
387) {
388    let oc = match open {
389        Some(o) => o,
390        None => return,
391    };
392
393    let existing = &oc.stored_occurrence;
394    let cur_start = oc.current_start;
395    let match_len = oc.match_len;
396
397    let existing_file = &prepared[existing.source_id];
398    let current_file = &prepared[current_file_idx];
399
400    let ex_start = existing.token_start;
401    let ex_end = ex_start + match_len - 1;
402    let cur_end = cur_start + match_len - 1;
403
404    // Guard: don't emit a self-clone for windows that reference overlapping token
405    // ranges in the same file. Adjacent ranges (sharing exactly one boundary token)
406    // are allowed — they represent consecutive duplicate blocks.
407    //
408    // Overlap condition: the two regions [ex_start, ex_end] and [cur_start, cur_end]
409    // overlap if ex_start < cur_end AND cur_start < ex_end (strict less-than).
410    // Using strict less-than excludes the adjacent/boundary case.
411    if existing.source_id == current_file_idx {
412        let overlap = if ex_start < cur_start {
413            // ex region starts first; overlaps if ex_end > cur_start
414            ex_end > cur_start
415        } else {
416            // cur region starts first (or equal); overlaps if cur_end > ex_start
417            cur_end > ex_start
418        };
419        if overlap {
420            return;
421        }
422    }
423
424    // skip_local: drop clone pairs where both fragments share the same parent directory.
425    if skip_local {
426        let dir_a = Path::new(&existing_file.id).parent();
427        let dir_b = Path::new(&current_file.id).parent();
428        if dir_a == dir_b {
429            return;
430        }
431    }
432
433    let fragment_a = match make_fragment(&existing_file.id, &existing_file.spans, ex_start, ex_end)
434    {
435        Some(f) => f,
436        None => return,
437    };
438    let fragment_b = match make_fragment(&current_file.id, &current_file.spans, cur_start, cur_end)
439    {
440        Some(f) => f,
441        None => return,
442    };
443
444    // min_lines filter: reject clones whose line span is shorter than min_lines.
445    // Mirrors jscpd's LinesLengthCloneValidator which checks
446    //   duplicationA.end.line - duplicationA.start.line >= minLines
447    if min_lines > 0 {
448        let span_a = fragment_a.end.line as i64 - fragment_a.start.line as i64;
449        let span_b = fragment_b.end.line as i64 - fragment_b.start.line as i64;
450        if span_a < min_lines as i64 && span_b < min_lines as i64 {
451            return;
452        }
453    }
454
455    clones.push(CpdClone {
456        format: current_file.format.clone(),
457        fragment_a,
458        fragment_b,
459        token_count: match_len as u32,
460    });
461}
462
463fn make_fragment(
464    source_id: &str,
465    spans: &[(Location, Location)],
466    start_idx: usize,
467    end_idx: usize,
468) -> Option<Fragment> {
469    let (first_start, _) = spans.get(start_idx)?;
470    let (_, last_end) = spans.get(end_idx)?;
471    Some(Fragment {
472        source_id: source_id.to_string(),
473        start: first_start.clone(),
474        end: last_end.clone(),
475        range: [start_idx as u32, end_idx as u32],
476        blame: None,
477    })
478}
479
480// ---------------------------------------------------------------------------
481// Deduplication — O(n) FxHashSet + sub-clone suppression
482// ---------------------------------------------------------------------------
483
484fn dedup_exact_clones(clones: &mut Vec<CpdClone>) {
485    // Normalize each clone so fragment_a <= fragment_b (by id then start line).
486    for clone in clones.iter_mut() {
487        let a_key = (&clone.fragment_a.source_id, clone.fragment_a.start.line);
488        let b_key = (&clone.fragment_b.source_id, clone.fragment_b.start.line);
489        if a_key > b_key {
490            std::mem::swap(&mut clone.fragment_a, &mut clone.fragment_b);
491        }
492    }
493
494    let mut seen: FxHashSet<CloneDedupKey> = FxHashSet::default();
495    clones.retain(|c| seen.insert(CloneDedupKey::from_clone(c)));
496}
497
498/// Remove clones that are fully contained (by token range) within a larger clone
499/// of the same file pair. Largest clones first so containers are processed before
500/// their contained sub-clones.
501///
502/// This handles sub-clone outputs from the secondary pass and any edge cases
503/// where the primary emits adjacent-region duplicates.
504fn suppress_subclones(clones: &mut Vec<CpdClone>) {
505    use std::cmp::Reverse;
506    clones.sort_by_key(|c| Reverse(c.token_count));
507
508    let n = clones.len();
509    let mut keep = vec![true; n];
510
511    for i in 0..n {
512        if !keep[i] {
513            continue;
514        }
515        let big_a_id = clones[i].fragment_a.source_id.clone();
516        let big_b_id = clones[i].fragment_b.source_id.clone();
517        let big_a_range = clones[i].fragment_a.range;
518        let big_b_range = clones[i].fragment_b.range;
519
520        for j in (i + 1)..n {
521            if !keep[j] {
522                continue;
523            }
524            let small = &clones[j];
525            if small.fragment_a.source_id != big_a_id || small.fragment_b.source_id != big_b_id {
526                continue;
527            }
528            // Sub-clone: small's both fragments are contained within big's ranges.
529            if big_a_range[0] <= small.fragment_a.range[0]
530                && big_a_range[1] >= small.fragment_a.range[1]
531                && big_b_range[0] <= small.fragment_b.range[0]
532                && big_b_range[1] >= small.fragment_b.range[1]
533            {
534                keep[j] = false;
535            }
536        }
537    }
538
539    let mut i = 0;
540    clones.retain(|_| {
541        let k = keep[i];
542        i += 1;
543        k
544    });
545}
546
547// ---------------------------------------------------------------------------
548// Secondary clone pass
549// ---------------------------------------------------------------------------
550
551fn remember_repeated_window(
552    repeated_windows: &mut FxHashMap<u64, Vec<Occurrence>>,
553    hash: u64,
554    occurrence: Occurrence,
555    cap: usize,
556) {
557    let bucket = repeated_windows.entry(hash).or_default();
558    if bucket
559        .iter()
560        .any(|s| s.source_id == occurrence.source_id && s.token_start == occurrence.token_start)
561    {
562        return;
563    }
564    if bucket.len() < cap {
565        bucket.push(occurrence);
566    }
567}
568
569fn add_secondary_clones(
570    repeated_windows: FxHashMap<u64, Vec<Occurrence>>,
571    prepared: &[PreparedSource],
572    min_tokens: usize,
573    skip_local: bool,
574    min_lines: usize,
575    clones: &mut Vec<CpdClone>,
576) {
577    if repeated_windows.is_empty() {
578        return;
579    }
580
581    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
582    struct Candidate {
583        source_a: usize,
584        source_b: usize,
585        token_a: usize,
586        token_b: usize,
587    }
588
589    let mut candidates: Vec<Candidate> = Vec::new();
590    for occurrences in repeated_windows.values() {
591        if occurrences.len() < 2 {
592            continue;
593        }
594        for li in 0..occurrences.len() {
595            for ri in li + 1..occurrences.len() {
596                let left = &occurrences[li];
597                let right = &occurrences[ri];
598                if left.source_id == right.source_id && left.token_start == right.token_start {
599                    continue;
600                }
601                let lh = &prepared[left.source_id].hashes;
602                let rh = &prepared[right.source_id].hashes;
603                let la = left.token_start;
604                let ra = right.token_start;
605                if la + min_tokens > lh.len() || ra + min_tokens > rh.len() {
606                    continue;
607                }
608                if lh[la..la + min_tokens] != rh[ra..ra + min_tokens] {
609                    continue;
610                }
611                let (sa, ta, sb, tb) =
612                    if (left.source_id, left.token_start) <= (right.source_id, right.token_start) {
613                        (
614                            left.source_id,
615                            left.token_start,
616                            right.source_id,
617                            right.token_start,
618                        )
619                    } else {
620                        (
621                            right.source_id,
622                            right.token_start,
623                            left.source_id,
624                            left.token_start,
625                        )
626                    };
627                candidates.push(Candidate {
628                    source_a: sa,
629                    source_b: sb,
630                    token_a: ta,
631                    token_b: tb,
632                });
633            }
634        }
635    }
636    if candidates.is_empty() {
637        return;
638    }
639    candidates.sort_unstable();
640    candidates.dedup();
641
642    // Build line-coverage from already-found primary clones.
643    let n_files = prepared.len();
644    let mut covered: Vec<Vec<(u32, u32)>> = vec![Vec::new(); n_files];
645    for c in clones.iter() {
646        let fa = prepared.iter().position(|p| p.id == c.fragment_a.source_id);
647        let fb = prepared.iter().position(|p| p.id == c.fragment_b.source_id);
648        if let Some(idx) = fa {
649            covered[idx].push((c.fragment_a.start.line, c.fragment_a.end.line));
650        }
651        if let Some(idx) = fb {
652            covered[idx].push((c.fragment_b.start.line, c.fragment_b.end.line));
653        }
654    }
655    for ranges in &mut covered {
656        ranges.sort_unstable();
657    }
658
659    let line_extends_coverage = |file_idx: usize, start: u32, end: u32| -> bool {
660        let ranges = &covered[file_idx];
661        let mut next = start;
662        for &(rs, re) in ranges {
663            if re < next {
664                continue;
665            }
666            if rs > next {
667                return true;
668            }
669            next = next.max(re.saturating_add(1));
670            if next > end {
671                return false;
672            }
673        }
674        next <= end
675    };
676
677    for cand in candidates {
678        // Build the clone using flush_clone's logic (direct fragment construction).
679        let existing = Occurrence {
680            source_id: cand.source_a,
681            token_start: cand.token_a,
682        };
683        let current = Occurrence {
684            source_id: cand.source_b,
685            token_start: cand.token_b,
686        };
687
688        // skip_local check
689        if skip_local {
690            let dir_a = Path::new(&prepared[existing.source_id].id).parent();
691            let dir_b = Path::new(&prepared[current.source_id].id).parent();
692            if dir_a == dir_b {
693                continue;
694            }
695        }
696
697        // Greedy extend.
698        let ex_hashes = &prepared[existing.source_id].hashes;
699        let cur_hashes = &prepared[current.source_id].hashes;
700        let max_extend = (ex_hashes
701            .len()
702            .saturating_sub(existing.token_start + min_tokens))
703        .min(
704            cur_hashes
705                .len()
706                .saturating_sub(current.token_start + min_tokens),
707        );
708        let mut extra = 0usize;
709        while extra < max_extend
710            && ex_hashes[existing.token_start + min_tokens + extra]
711                == cur_hashes[current.token_start + min_tokens + extra]
712        {
713            extra += 1;
714        }
715        let match_len = min_tokens + extra;
716
717        let ex_start = existing.token_start;
718        let ex_end = ex_start + match_len - 1;
719        let cur_start = current.token_start;
720        let cur_end = cur_start + match_len - 1;
721
722        let frag_a = match make_fragment(
723            &prepared[existing.source_id].id,
724            &prepared[existing.source_id].spans,
725            ex_start,
726            ex_end,
727        ) {
728            Some(f) => f,
729            None => continue,
730        };
731        let frag_b = match make_fragment(
732            &prepared[current.source_id].id,
733            &prepared[current.source_id].spans,
734            cur_start,
735            cur_end,
736        ) {
737            Some(f) => f,
738            None => continue,
739        };
740
741        let nc = CpdClone {
742            format: prepared[current.source_id].format.clone(),
743            fragment_a: frag_a,
744            fragment_b: frag_b,
745            token_count: match_len as u32,
746        };
747
748        // min_lines filter for secondary clones too.
749        if min_lines > 0 {
750            let span_a = nc.fragment_a.end.line as i64 - nc.fragment_a.start.line as i64;
751            let span_b = nc.fragment_b.end.line as i64 - nc.fragment_b.start.line as i64;
752            if span_a < min_lines as i64 && span_b < min_lines as i64 {
753                continue;
754            }
755        }
756
757        let extends_a = line_extends_coverage(
758            cand.source_a,
759            nc.fragment_a.start.line,
760            nc.fragment_a.end.line,
761        );
762        let extends_b = line_extends_coverage(
763            cand.source_b,
764            nc.fragment_b.start.line,
765            nc.fragment_b.end.line,
766        );
767        if extends_a || extends_b {
768            clones.push(nc);
769        }
770    }
771}
772
773// ---------------------------------------------------------------------------
774// Tests
775// ---------------------------------------------------------------------------
776
777#[cfg(test)]
778mod tests {
779    use super::*;
780    use crate::models::{Location, Token, TokenKind};
781
782    fn loc(line: u32, col: u32, offset: u32) -> Location {
783        Location {
784            line,
785            column: col,
786            offset,
787        }
788    }
789
790    fn make_token(kind: TokenKind, value: &str, line: u32, col: u32, offset: u32) -> Token {
791        let end_col = col + value.len() as u32;
792        let end_off = offset + value.len() as u32;
793        Token {
794            kind,
795            value: value.to_string(),
796            start: loc(line, col, offset),
797            end: loc(line, end_col, end_off),
798        }
799    }
800
801    fn make_file(id: &str, format: &str, tokens: Vec<Token>) -> SourceFile {
802        SourceFile {
803            id: id.to_string(),
804            format: format.to_string(),
805            tokens,
806        }
807    }
808
809    fn js_tokens_ab() -> Vec<Token> {
810        vec![
811            make_token(TokenKind::Keyword, "function", 1, 0, 0),
812            make_token(TokenKind::Other, "hello", 1, 9, 9),
813            make_token(TokenKind::Operator, "(", 1, 14, 14),
814            make_token(TokenKind::Operator, ")", 1, 15, 15),
815            make_token(TokenKind::Operator, "{", 1, 16, 16),
816            make_token(TokenKind::Keyword, "return", 2, 0, 18),
817            make_token(TokenKind::Literal, "42", 2, 7, 25),
818            make_token(TokenKind::Operator, ";", 2, 9, 27),
819            make_token(TokenKind::Operator, "}", 3, 0, 29),
820        ]
821    }
822
823    #[test]
824    fn empty_input_returns_empty() {
825        let result = detect(&[], 10);
826        assert!(result.is_empty());
827    }
828
829    #[test]
830    fn identical_files_detected_as_clone() {
831        let tokens = js_tokens_ab();
832        let file_a = make_file("a.js", "javascript", tokens.clone());
833        let file_b = make_file("b.js", "javascript", tokens);
834        let clones = detect(&[file_a, file_b], 5);
835        assert!(
836            !clones.is_empty(),
837            "identical files must produce at least one clone"
838        );
839    }
840
841    #[test]
842    fn min_tokens_threshold_respected() {
843        let tokens = js_tokens_ab(); // 9 tokens
844        let file_a = make_file("a.js", "javascript", tokens.clone());
845        let file_b = make_file("b.js", "javascript", tokens);
846        let clones = detect(&[file_a, file_b], 100);
847        assert!(
848            clones.is_empty(),
849            "no clones when min_tokens exceeds file length"
850        );
851    }
852
853    #[test]
854    fn deduplication_ab_ba_collapse() {
855        let tokens = js_tokens_ab();
856        let file_a = make_file("a.js", "javascript", tokens.clone());
857        let file_b = make_file("b.js", "javascript", tokens);
858        let clones = detect(&[file_a, file_b], 5);
859        // Should get exactly 1 clone pair, not 2 symmetric ones.
860        assert_eq!(clones.len(), 1, "symmetric pairs must collapse to 1");
861    }
862
863    #[test]
864    fn different_formats_not_cross_detected() {
865        let tokens = js_tokens_ab();
866        let file_js = make_file("a.js", "javascript", tokens.clone());
867        let file_py = make_file("a.py", "python", tokens);
868        let clones = detect(&[file_js, file_py], 5);
869        assert!(
870            clones.is_empty(),
871            "tokens from different formats must not match"
872        );
873    }
874
875    #[test]
876    fn identical_files_maximal_clone() {
877        // With the open_clone state machine, a single maximal clone is emitted
878        // instead of multiple sliding-window sub-clones.
879        let tokens = js_tokens_ab();
880        let file_a = make_file("a.js", "javascript", tokens.clone());
881        let file_b = make_file("b.js", "javascript", tokens);
882        let clones = detect(&[file_a, file_b], 5);
883        assert_eq!(
884            clones.len(),
885            1,
886            "open_clone SM must produce one maximal clone"
887        );
888        assert_eq!(
889            clones[0].token_count, 9,
890            "maximal clone must cover all 9 tokens"
891        );
892    }
893
894    #[test]
895    fn three_identical_files_secondary_pass_adds_missing_pair() {
896        let tokens = js_tokens_ab();
897        let file_a = make_file("a.js", "javascript", tokens.clone());
898        let file_b = make_file("b.js", "javascript", tokens.clone());
899        let file_c = make_file("c.js", "javascript", tokens);
900        let clones = detect(&[file_a, file_b, file_c], 5);
901        assert!(
902            clones.len() >= 2,
903            "three identical files must yield at least 2 clone pairs, got {}",
904            clones.len()
905        );
906    }
907
908    #[test]
909    fn clones_sorted_by_source_and_line() {
910        let tokens = js_tokens_ab();
911        let file_a = make_file("a.js", "javascript", tokens.clone());
912        let file_b = make_file("b.js", "javascript", tokens);
913        let clones = detect(&[file_a, file_b], 5);
914        for i in 1..clones.len() {
915            let prev = &clones[i - 1];
916            let curr = &clones[i];
917            assert!(
918                (
919                    &prev.fragment_a.source_id,
920                    prev.fragment_a.start.line,
921                    &prev.fragment_b.source_id,
922                    prev.fragment_b.start.line,
923                ) <= (
924                    &curr.fragment_a.source_id,
925                    curr.fragment_a.start.line,
926                    &curr.fragment_b.source_id,
927                    curr.fragment_b.start.line,
928                ),
929                "clones must be sorted"
930            );
931        }
932    }
933}
cpd_core/detect.rs

cpd_core/
detect.rs