cpd_core/
detect.rs

1// detect.rs
2
3use rayon::prelude::*;
4use rustc_hash::{FxHashMap, FxHashSet};
5use std::path::Path;
6
7use crate::{
8    hash::{base_pow, hash_window, roll, token_hash},
9    models::{CpdClone, DetectionToken, Fragment, Location, SourceFile, TokenKind},
10};
11
12// ---------------------------------------------------------------------------
13// Internal store type — replaces the Store trait + MemoryStore
14// ---------------------------------------------------------------------------
15
16/// Window store: maps a window hash to the last seen occurrence.
17/// Type alias — no trait indirection, no vtable, no dyn dispatch.
18type WindowStore = FxHashMap<u64, Occurrence>;
19
20/// Lightweight reference to a window position within a format-group detection call.
21#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22struct Occurrence {
23    /// Index into the `prepared` array for this `detect_in_group` call.
24    source_id: usize,
25    token_start: usize,
26}
27
28// ---------------------------------------------------------------------------
29// Deduplication key
30// ---------------------------------------------------------------------------
31
32#[derive(Debug, Clone, PartialEq, Eq, Hash)]
33struct CloneDedupKey {
34    a_id: String,
35    a_start_line: u32,
36    b_id: String,
37    b_start_line: u32,
38}
39
40impl CloneDedupKey {
41    fn from_clone(c: &CpdClone) -> Self {
42        // Normalize: smaller (id, line) first so (A,B) and (B,A) map to the same key.
43        let a_key = (&c.fragment_a.source_id, c.fragment_a.start.line);
44        let b_key = (&c.fragment_b.source_id, c.fragment_b.start.line);
45        if a_key <= b_key {
46            Self {
47                a_id: c.fragment_a.source_id.clone(),
48                a_start_line: c.fragment_a.start.line,
49                b_id: c.fragment_b.source_id.clone(),
50                b_start_line: c.fragment_b.start.line,
51            }
52        } else {
53            Self {
54                a_id: c.fragment_b.source_id.clone(),
55                a_start_line: c.fragment_b.start.line,
56                b_id: c.fragment_a.source_id.clone(),
57                b_start_line: c.fragment_a.start.line,
58            }
59        }
60    }
61}
62
63// ---------------------------------------------------------------------------
64// Public API — SourceFile path (for backward compat with tests)
65// ---------------------------------------------------------------------------
66
67/// Detect duplicate code clones across `files` using a rolling-hash sliding window.
68///
69/// Files are grouped by format; each format group is processed independently.
70/// Rayon is used for outer parallelism (one task per format group).
71pub fn detect(files: &[SourceFile], min_tokens: usize) -> Vec<CpdClone> {
72    detect_with_options(files, min_tokens, false, 0)
73}
74
75/// Detect clones with extended options.
76///
77/// - `skip_local`: skip clone pairs where both fragments share the same parent directory.
78/// - `min_lines`: reject clones whose fragment line span is shorter than this.
79///   The line span is `end.line - start.line`; a clone is kept only if this value
80///   is >= `min_lines`. This mirrors jscpd's `LinesLengthCloneValidator`.
81pub fn detect_with_options(
82    files: &[SourceFile],
83    min_tokens: usize,
84    skip_local: bool,
85    min_lines: usize,
86) -> Vec<CpdClone> {
87    if files.is_empty() || min_tokens == 0 {
88        return vec![];
89    }
90
91    // Group files by format. Sort for deterministic order.
92    let mut by_format: FxHashMap<&str, Vec<usize>> = FxHashMap::default();
93    for (idx, file) in files.iter().enumerate() {
94        by_format.entry(file.format.as_str()).or_default().push(idx);
95    }
96    let mut format_groups: Vec<(&str, Vec<usize>)> = by_format.into_iter().collect();
97    format_groups.sort_unstable_by_key(|(fmt, _)| *fmt);
98    for (_, group) in &mut format_groups {
99        group.sort_unstable_by_key(|&idx| files[idx].id.as_str());
100    }
101
102    let all_clones: Vec<Vec<CpdClone>> = format_groups
103        .into_par_iter()
104        .map(|(_format, indices)| {
105            // Build per-group prepared data from SourceFile.tokens.
106            // This is the backward-compat path; orchestrate.rs uses
107            // detect_prepared() directly to avoid re-hashing.
108            let prepared: Vec<PreparedSource> = indices
109                .iter()
110                .map(|&idx| {
111                    let file = &files[idx];
112                    let mut hashes = Vec::with_capacity(file.tokens.len());
113                    let mut spans: Vec<(Location, Location)> =
114                        Vec::with_capacity(file.tokens.len());
115                    for t in &file.tokens {
116                        if t.kind == TokenKind::Ignore {
117                            continue;
118                        }
119                        hashes.push(token_hash(t.kind.discriminant(), &t.value));
120                        spans.push((t.start.clone(), t.end.clone()));
121                    }
122                    PreparedSource {
123                        id: file.id.clone(),
124                        format: file.format.clone(),
125                        hashes,
126                        spans,
127                    }
128                })
129                .collect();
130            detect_in_group(&prepared, min_tokens, skip_local, min_lines)
131        })
132        .collect();
133
134    let mut clones: Vec<CpdClone> = all_clones.into_iter().flatten().collect();
135    dedup_exact_clones(&mut clones);
136
137    clones.sort_by(|a, b| {
138        (
139            &a.fragment_a.source_id,
140            a.fragment_a.start.line,
141            &a.fragment_b.source_id,
142            a.fragment_b.start.line,
143        )
144            .cmp(&(
145                &b.fragment_a.source_id,
146                b.fragment_a.start.line,
147                &b.fragment_b.source_id,
148                b.fragment_b.start.line,
149            ))
150    });
151
152    clones
153}
154
155// ---------------------------------------------------------------------------
156// Direct DetectionToken path (called by orchestrate.rs)
157// ---------------------------------------------------------------------------
158
159/// A file ready for detection: pre-hashed, pre-filtered.
160///
161/// Produced either from `SourceFile.tokens` (backward compat) or directly from
162/// `tokenize_to_detection` output (fast path used by orchestrate.rs).
163pub struct PreparedSource {
164    pub id: String,
165    pub format: String,
166    pub hashes: Vec<u64>,
167    pub spans: Vec<(Location, Location)>,
168}
169
170impl PreparedSource {
171    /// Build from a `DetectionToken` slice — the fast path.
172    pub fn from_detection_tokens(id: String, format: String, tokens: &[DetectionToken]) -> Self {
173        let mut hashes = Vec::with_capacity(tokens.len());
174        let mut spans = Vec::with_capacity(tokens.len());
175        for t in tokens {
176            hashes.push(t.hash);
177            spans.push((t.start.clone(), t.end.clone()));
178        }
179        Self {
180            id,
181            format,
182            hashes,
183            spans,
184        }
185    }
186}
187
188/// Detect clones from pre-prepared sources grouped by format.
189///
190/// Called by orchestrate.rs after `tokenize_to_detection` — skips re-hashing.
191pub fn detect_prepared(
192    format_groups: Vec<Vec<PreparedSource>>,
193    min_tokens: usize,
194    skip_local: bool,
195    min_lines: usize,
196) -> Vec<CpdClone> {
197    if format_groups.is_empty() || min_tokens == 0 {
198        return vec![];
199    }
200
201    let all_clones: Vec<Vec<CpdClone>> = format_groups
202        .into_par_iter()
203        .map(|group| detect_in_group(&group, min_tokens, skip_local, min_lines))
204        .collect();
205
206    let mut clones: Vec<CpdClone> = all_clones.into_iter().flatten().collect();
207    dedup_exact_clones(&mut clones);
208
209    clones.sort_by(|a, b| {
210        (
211            &a.fragment_a.source_id,
212            a.fragment_a.start.line,
213            &a.fragment_b.source_id,
214            a.fragment_b.start.line,
215        )
216            .cmp(&(
217                &b.fragment_a.source_id,
218                b.fragment_a.start.line,
219                &b.fragment_b.source_id,
220                b.fragment_b.start.line,
221            ))
222    });
223
224    clones
225}
226
227// ---------------------------------------------------------------------------
228// Core detection — per format group
229// ---------------------------------------------------------------------------
230
231fn detect_in_group(
232    prepared: &[PreparedSource],
233    min_tokens: usize,
234    skip_local: bool,
235    min_lines: usize,
236) -> Vec<CpdClone> {
237    // Precompute window_power once for this format group.
238    // If per-language min_tokens is introduced, recompute per group (it is already scoped here).
239    let window_power = base_pow(min_tokens.saturating_sub(1));
240
241    // Pre-allocate store capacity to avoid FxHashMap rehashing.
242    let total_windows: usize = prepared
243        .iter()
244        .map(|p| p.hashes.len().saturating_sub(min_tokens))
245        .sum();
246    let mut store: WindowStore =
247        FxHashMap::with_capacity_and_hasher(total_windows, Default::default());
248
249    let mut clones: Vec<CpdClone> = Vec::new();
250    // Cap repeated-window occurrences per hash. Higher values find more clone pairs
251    // among 3+ similar files (e.g., file_1.js, file_1.mjs, file_1.cjs) but use more memory.
252    // The TypeScript jscpd compares all file pairs, so we raise this to match its coverage.
253    const SECONDARY_OCCURRENCE_CAP: usize = 2;
254    let mut repeated_windows: FxHashMap<u64, Vec<Occurrence>> = FxHashMap::default();
255
256    for (file_idx, source) in prepared.iter().enumerate() {
257        let hashes = &source.hashes;
258        if hashes.len() < min_tokens {
259            continue;
260        }
261        let windows_len = hashes.len() - min_tokens + 1;
262
263        // open_clone state machine: replaces emit-every-window + suppress_subclones.
264        // A clone is opened when a matching window is found and enlarged as long as
265        // subsequent windows also match. flush_clone is called when the match breaks
266        // or the file scan ends — only one clone per contiguous matching region.
267        let mut open_clone: Option<OpenClone> = None;
268
269        let mut window_hash = hash_window(&hashes[..min_tokens]);
270
271        for token_start in 0..windows_len {
272            if token_start > 0 {
273                window_hash = roll(
274                    window_hash,
275                    hashes[token_start - 1],
276                    hashes[token_start + min_tokens - 1],
277                    window_power,
278                );
279            }
280
281            let current = Occurrence {
282                source_id: file_idx,
283                token_start,
284            };
285
286            match store.get(&window_hash).copied() {
287                Some(stored) if windows_match(stored, current, prepared, min_tokens) => {
288                    if open_clone.is_none() {
289                        open_clone = Some(OpenClone {
290                            stored_occurrence: stored,
291                            current_start: token_start,
292                            match_len: min_tokens,
293                        });
294                    } else if let Some(ref mut oc) = open_clone {
295                        // Enlarge: the next window also matches — extend by one token.
296                        oc.match_len += 1;
297                    }
298                    remember_repeated_window(
299                        &mut repeated_windows,
300                        window_hash,
301                        stored,
302                        SECONDARY_OCCURRENCE_CAP,
303                    );
304                    remember_repeated_window(
305                        &mut repeated_windows,
306                        window_hash,
307                        current,
308                        SECONDARY_OCCURRENCE_CAP,
309                    );
310                    // Do NOT update store — keep the first occurrence so the enlargement
311                    // stays consistent across the contiguous match region.
312                }
313                _ => {
314                    // Match broke (or no entry). Flush whatever was open.
315                    flush_clone(
316                        open_clone.take(),
317                        file_idx,
318                        prepared,
319                        skip_local,
320                        min_lines,
321                        &mut clones,
322                    );
323                    store.insert(window_hash, current);
324                }
325            }
326        }
327
328        // Flush any open clone at the end of the file scan.
329        flush_clone(
330            open_clone.take(),
331            file_idx,
332            prepared,
333            skip_local,
334            min_lines,
335            &mut clones,
336        );
337    }
338
339    add_secondary_clones(
340        repeated_windows,
341        prepared,
342        min_tokens,
343        skip_local,
344        min_lines,
345        &mut clones,
346    );
347
348    clones
349}
350
351// ---------------------------------------------------------------------------
352// Open clone state machine helpers
353// ---------------------------------------------------------------------------
354
355struct OpenClone {
356    stored_occurrence: Occurrence,
357    current_start: usize,
358    match_len: usize,
359}
360
361/// Returns true if the window at `current` actually matches the window at `stored`
362/// (hash match is necessary but not sufficient — verify token equality).
363fn windows_match(
364    stored: Occurrence,
365    current: Occurrence,
366    prepared: &[PreparedSource],
367    min_tokens: usize,
368) -> bool {
369    if stored.source_id == current.source_id && stored.token_start == current.token_start {
370        return false;
371    }
372    let stored_hashes = &prepared[stored.source_id].hashes;
373    let current_hashes = &prepared[current.source_id].hashes;
374    if stored.token_start + min_tokens > stored_hashes.len()
375        || current.token_start + min_tokens > current_hashes.len()
376    {
377        return false;
378    }
379    stored_hashes[stored.token_start..stored.token_start + min_tokens]
380        == current_hashes[current.token_start..current.token_start + min_tokens]
381}
382
383/// Flush an open clone to the clones list.
384///
385/// A clone is rejected if its line span is shorter than `min_lines`.
386/// The line span is measured as `end.line - start.line` (which equals
387/// `number_of_lines - 1`). Mirrors jscpd's `LinesLengthCloneValidator`.
388fn flush_clone(
389    open: Option<OpenClone>,
390    current_file_idx: usize,
391    prepared: &[PreparedSource],
392    skip_local: bool,
393    min_lines: usize,
394    clones: &mut Vec<CpdClone>,
395) {
396    let oc = match open {
397        Some(o) => o,
398        None => return,
399    };
400
401    let existing = &oc.stored_occurrence;
402    let cur_start = oc.current_start;
403    let match_len = oc.match_len;
404
405    let existing_file = &prepared[existing.source_id];
406    let current_file = &prepared[current_file_idx];
407
408    let ex_start = existing.token_start;
409    let ex_end = ex_start + match_len - 1;
410    let cur_end = cur_start + match_len - 1;
411
412    // skip_local: drop clone pairs where both fragments share the same parent directory.
413    if skip_local {
414        let dir_a = Path::new(&existing_file.id).parent();
415        let dir_b = Path::new(&current_file.id).parent();
416        if dir_a == dir_b {
417            return;
418        }
419    }
420
421    let fragment_a = match make_fragment(&existing_file.id, &existing_file.spans, ex_start, ex_end)
422    {
423        Some(f) => f,
424        None => return,
425    };
426    let fragment_b = match make_fragment(&current_file.id, &current_file.spans, cur_start, cur_end)
427    {
428        Some(f) => f,
429        None => return,
430    };
431
432    // min_lines filter: reject clones whose fragment A line span is shorter than min_lines.
433    // Mirrors jscpd's LinesLengthCloneValidator which checks only duplicationA:
434    //   duplicationA.end.line - duplicationA.start.line >= minLines
435    if min_lines > 0 {
436        let lines = fragment_a.end.line as usize - fragment_a.start.line as usize;
437        if lines < min_lines {
438            return;
439        }
440    }
441
442    clones.push(CpdClone {
443        format: current_file.format.clone(),
444        fragment_a,
445        fragment_b,
446        token_count: match_len as u32,
447    });
448}
449
450fn make_fragment(
451    source_id: &str,
452    spans: &[(Location, Location)],
453    start_idx: usize,
454    end_idx: usize,
455) -> Option<Fragment> {
456    let (first_start, _) = spans.get(start_idx)?;
457    let (_, last_end) = spans.get(end_idx)?;
458    Some(Fragment {
459        source_id: source_id.to_string(),
460        start: first_start.clone(),
461        end: last_end.clone(),
462        range: [start_idx as u32, end_idx as u32],
463        blame: None,
464    })
465}
466
467// ---------------------------------------------------------------------------
468// Deduplication — O(n) FxHashSet + sub-clone suppression
469// ---------------------------------------------------------------------------
470
471fn dedup_exact_clones(clones: &mut Vec<CpdClone>) {
472    // Normalize each clone so fragment_a <= fragment_b (by id then start line).
473    for clone in clones.iter_mut() {
474        let a_key = (&clone.fragment_a.source_id, clone.fragment_a.start.line);
475        let b_key = (&clone.fragment_b.source_id, clone.fragment_b.start.line);
476        if a_key > b_key {
477            std::mem::swap(&mut clone.fragment_a, &mut clone.fragment_b);
478        }
479    }
480
481    let mut seen: FxHashSet<CloneDedupKey> = FxHashSet::default();
482    clones.retain(|c| seen.insert(CloneDedupKey::from_clone(c)));
483}
484
485// ---------------------------------------------------------------------------
486// Secondary clone pass
487// ---------------------------------------------------------------------------
488
489fn remember_repeated_window(
490    repeated_windows: &mut FxHashMap<u64, Vec<Occurrence>>,
491    hash: u64,
492    occurrence: Occurrence,
493    cap: usize,
494) {
495    let bucket = repeated_windows.entry(hash).or_default();
496    if bucket
497        .iter()
498        .any(|s| s.source_id == occurrence.source_id && s.token_start == occurrence.token_start)
499    {
500        return;
501    }
502    if bucket.len() < cap {
503        bucket.push(occurrence);
504    }
505}
506
507struct SecondaryOpen {
508    clone: CpdClone,
509    source_a: usize,
510    source_b: usize,
511    last_token_start_a: usize,
512    last_token_start_b: usize,
513}
514
515fn add_secondary_clones(
516    repeated_windows: FxHashMap<u64, Vec<Occurrence>>,
517    prepared: &[PreparedSource],
518    min_tokens: usize,
519    skip_local: bool,
520    min_lines: usize,
521    clones: &mut Vec<CpdClone>,
522) {
523    if repeated_windows.is_empty() {
524        return;
525    }
526
527    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
528    struct Candidate {
529        source_a: usize,
530        source_b: usize,
531        token_a: usize,
532        token_b: usize,
533    }
534
535    let mut candidates: Vec<Candidate> = Vec::new();
536    for occurrences in repeated_windows.values() {
537        if occurrences.len() < 2 {
538            continue;
539        }
540        for li in 0..occurrences.len() {
541            for ri in li + 1..occurrences.len() {
542                let left = &occurrences[li];
543                let right = &occurrences[ri];
544                if left.source_id == right.source_id && left.token_start == right.token_start {
545                    continue;
546                }
547                let lh = &prepared[left.source_id].hashes;
548                let rh = &prepared[right.source_id].hashes;
549                let la = left.token_start;
550                let ra = right.token_start;
551                if la + min_tokens > lh.len() || ra + min_tokens > rh.len() {
552                    continue;
553                }
554                if lh[la..la + min_tokens] != rh[ra..ra + min_tokens] {
555                    continue;
556                }
557                let (sa, ta, sb, tb) =
558                    if (left.source_id, left.token_start) <= (right.source_id, right.token_start) {
559                        (
560                            left.source_id,
561                            left.token_start,
562                            right.source_id,
563                            right.token_start,
564                        )
565                    } else {
566                        (
567                            right.source_id,
568                            right.token_start,
569                            left.source_id,
570                            left.token_start,
571                        )
572                    };
573                candidates.push(Candidate {
574                    source_a: sa,
575                    source_b: sb,
576                    token_a: ta,
577                    token_b: tb,
578                });
579            }
580        }
581    }
582    if candidates.is_empty() {
583        return;
584    }
585    candidates.sort_unstable();
586    candidates.dedup();
587
588    // Build line-coverage from already-found primary clones.
589    let mut coverage = LineCoverage::from_clones(prepared, clones);
590    let mut open: Option<SecondaryOpen> = None;
591
592    for candidate in candidates {
593        if let Some(current) = open.as_mut()
594            && current.source_a == candidate.source_a
595            && current.source_b == candidate.source_b
596            && current.last_token_start_a + 1 == candidate.token_a
597            && current.last_token_start_b + 1 == candidate.token_b
598        {
599            // Enlarge: extend the open secondary clone by one token on each side.
600            let new_match_len = current.clone.token_count as usize + 1;
601            let end_idx_a = candidate.token_a + min_tokens;
602            let end_idx_b = candidate.token_b + min_tokens;
603            if let Some(frag_a_end) = prepared[current.source_a].spans.get(end_idx_a) {
604                current.clone.fragment_a.end = frag_a_end.1.clone();
605                current.clone.fragment_a.range[1] = end_idx_a as u32;
606            }
607            if let Some(frag_b_end) = prepared[current.source_b].spans.get(end_idx_b) {
608                current.clone.fragment_b.end = frag_b_end.1.clone();
609                current.clone.fragment_b.range[1] = end_idx_b as u32;
610            }
611            current.clone.token_count = new_match_len as u32;
612            current.last_token_start_a = candidate.token_a;
613            current.last_token_start_b = candidate.token_b;
614            continue;
615        }
616
617        flush_secondary_clone(
618            open.take(),
619            prepared,
620            skip_local,
621            min_lines,
622            clones,
623            &mut coverage,
624        );
625
626        // Create a new secondary clone candidate.
627        let start_a = candidate.token_a;
628        let end_a = start_a + min_tokens - 1;
629        let start_b = candidate.token_b;
630        let end_b = start_b + min_tokens - 1;
631
632        let frag_a = match make_fragment(
633            &prepared[candidate.source_a].id,
634            &prepared[candidate.source_a].spans,
635            start_a,
636            end_a,
637        ) {
638            Some(f) => f,
639            None => continue,
640        };
641        let frag_b = match make_fragment(
642            &prepared[candidate.source_b].id,
643            &prepared[candidate.source_b].spans,
644            start_b,
645            end_b,
646        ) {
647            Some(f) => f,
648            None => continue,
649        };
650
651        open = Some(SecondaryOpen {
652            clone: CpdClone {
653                format: prepared[candidate.source_a].format.clone(),
654                fragment_a: frag_a,
655                fragment_b: frag_b,
656                token_count: min_tokens as u32,
657            },
658            source_a: candidate.source_a,
659            source_b: candidate.source_b,
660            last_token_start_a: candidate.token_a,
661            last_token_start_b: candidate.token_b,
662        });
663    }
664
665    flush_secondary_clone(
666        open.take(),
667        prepared,
668        skip_local,
669        min_lines,
670        clones,
671        &mut coverage,
672    );
673}
674
675fn flush_secondary_clone(
676    open: Option<SecondaryOpen>,
677    prepared: &[PreparedSource],
678    skip_local: bool,
679    min_lines: usize,
680    clones: &mut Vec<CpdClone>,
681    coverage: &mut LineCoverage,
682) {
683    let Some(oc) = open else {
684        return;
685    };
686
687    let range_a = fragment_line_range(&oc.clone.fragment_a);
688    let range_b = fragment_line_range(&oc.clone.fragment_b);
689
690    // skip_local check
691    if skip_local {
692        let dir_a = Path::new(&prepared[oc.source_a].id).parent();
693        let dir_b = Path::new(&prepared[oc.source_b].id).parent();
694        if dir_a == dir_b {
695            return;
696        }
697    }
698
699    // min_lines filter: only check fragment A, mirroring jscpd's LinesLengthCloneValidator.
700    if min_lines > 0 {
701        let lines = oc.clone.fragment_a.end.line as usize - oc.clone.fragment_a.start.line as usize;
702        if lines < min_lines {
703            return;
704        }
705    }
706
707    // Line-coverage filter: skip secondary clones that don't extend existing coverage
708    // on either side.  This prevents the report from filling up with dozens of
709    // overlapping sub-clones of the same region.
710    if !coverage.extends(oc.source_a, range_a) || !coverage.extends(oc.source_b, range_b) {
711        return;
712    }
713
714    let before = clones.len();
715    clones.push(oc.clone);
716
717    // Insert coverage for newly added clone.
718    if clones.len() > before {
719        coverage.insert(oc.source_a, range_a);
720        coverage.insert(oc.source_b, range_b);
721    }
722}
723
724fn fragment_line_range(fragment: &Fragment) -> (usize, usize) {
725    let start = fragment.start.line as usize;
726    let end = fragment.end.line as usize;
727    (start.min(end), start.max(end))
728}
729
730// ---------------------------------------------------------------------------
731// Line coverage tracking for secondary clones
732// ---------------------------------------------------------------------------
733
734struct LineCoverage {
735    ranges_by_source: Vec<Vec<(usize, usize)>>,
736}
737
738impl LineCoverage {
739    fn from_clones(prepared: &[PreparedSource], clones: &[CpdClone]) -> Self {
740        let mut source_lookup: FxHashMap<&str, usize> = FxHashMap::default();
741        for (idx, source) in prepared.iter().enumerate() {
742            source_lookup.insert(source.id.as_str(), idx);
743        }
744        let mut coverage = Self {
745            ranges_by_source: vec![Vec::new(); prepared.len()],
746        };
747        for clone in clones {
748            if let Some(idx) = source_lookup.get(clone.fragment_a.source_id.as_str()) {
749                coverage.insert(*idx, fragment_line_range(&clone.fragment_a));
750            }
751            if let Some(idx) = source_lookup.get(clone.fragment_b.source_id.as_str()) {
752                coverage.insert(*idx, fragment_line_range(&clone.fragment_b));
753            }
754        }
755        coverage
756    }
757
758    fn extends(&self, source_idx: usize, range: (usize, usize)) -> bool {
759        let Some(ranges) = self.ranges_by_source.get(source_idx) else {
760            return true;
761        };
762        let mut next_line = range.0;
763        for &(start, end) in ranges {
764            if end < next_line {
765                continue;
766            }
767            if start > next_line {
768                return true;
769            }
770            next_line = next_line.max(end.saturating_add(1));
771            if next_line > range.1 {
772                return false;
773            }
774        }
775        next_line <= range.1
776    }
777
778    fn insert(&mut self, source_idx: usize, range: (usize, usize)) {
779        let Some(ranges) = self.ranges_by_source.get_mut(source_idx) else {
780            return;
781        };
782        ranges.push(range);
783        ranges.sort_unstable();
784
785        let mut merged: Vec<(usize, usize)> = Vec::with_capacity(ranges.len());
786        for &(start, end) in ranges.iter() {
787            if let Some((_, previous_end)) = merged.last_mut()
788                && start <= previous_end.saturating_add(1)
789            {
790                *previous_end = (*previous_end).max(end);
791                continue;
792            }
793            merged.push((start, end));
794        }
795        *ranges = merged;
796    }
797}
798
799// ---------------------------------------------------------------------------
800// Tests
801// ---------------------------------------------------------------------------
802
803#[cfg(test)]
804mod tests {
805    use super::*;
806    use crate::models::{Location, Token, TokenKind};
807
808    fn loc(line: u32, col: u32, offset: u32) -> Location {
809        Location {
810            line,
811            column: col,
812            offset,
813        }
814    }
815
816    fn make_token(kind: TokenKind, value: &str, line: u32, col: u32, offset: u32) -> Token {
817        let end_col = col + value.len() as u32;
818        let end_off = offset + value.len() as u32;
819        Token {
820            kind,
821            value: value.to_string(),
822            start: loc(line, col, offset),
823            end: loc(line, end_col, end_off),
824        }
825    }
826
827    fn make_file(id: &str, format: &str, tokens: Vec<Token>) -> SourceFile {
828        SourceFile {
829            id: id.to_string(),
830            format: format.to_string(),
831            tokens,
832        }
833    }
834
835    fn js_tokens_ab() -> Vec<Token> {
836        vec![
837            make_token(TokenKind::Keyword, "function", 1, 0, 0),
838            make_token(TokenKind::Other, "hello", 1, 9, 9),
839            make_token(TokenKind::Operator, "(", 1, 14, 14),
840            make_token(TokenKind::Operator, ")", 1, 15, 15),
841            make_token(TokenKind::Operator, "{", 1, 16, 16),
842            make_token(TokenKind::Keyword, "return", 2, 0, 18),
843            make_token(TokenKind::Literal, "42", 2, 7, 25),
844            make_token(TokenKind::Operator, ";", 2, 9, 27),
845            make_token(TokenKind::Operator, "}", 3, 0, 29),
846        ]
847    }
848
849    #[test]
850    fn empty_input_returns_empty() {
851        let result = detect(&[], 10);
852        assert!(result.is_empty());
853    }
854
855    #[test]
856    fn identical_files_detected_as_clone() {
857        let tokens = js_tokens_ab();
858        let file_a = make_file("a.js", "javascript", tokens.clone());
859        let file_b = make_file("b.js", "javascript", tokens);
860        let clones = detect(&[file_a, file_b], 5);
861        assert!(
862            !clones.is_empty(),
863            "identical files must produce at least one clone"
864        );
865    }
866
867    #[test]
868    fn min_tokens_threshold_respected() {
869        let tokens = js_tokens_ab(); // 9 tokens
870        let file_a = make_file("a.js", "javascript", tokens.clone());
871        let file_b = make_file("b.js", "javascript", tokens);
872        let clones = detect(&[file_a, file_b], 100);
873        assert!(
874            clones.is_empty(),
875            "no clones when min_tokens exceeds file length"
876        );
877    }
878
879    #[test]
880    fn deduplication_ab_ba_collapse() {
881        let tokens = js_tokens_ab();
882        let file_a = make_file("a.js", "javascript", tokens.clone());
883        let file_b = make_file("b.js", "javascript", tokens);
884        let clones = detect(&[file_a, file_b], 5);
885        // Should get exactly 1 clone pair, not 2 symmetric ones.
886        assert_eq!(clones.len(), 1, "symmetric pairs must collapse to 1");
887    }
888
889    #[test]
890    fn different_formats_not_cross_detected() {
891        let tokens = js_tokens_ab();
892        let file_js = make_file("a.js", "javascript", tokens.clone());
893        let file_py = make_file("a.py", "python", tokens);
894        let clones = detect(&[file_js, file_py], 5);
895        assert!(
896            clones.is_empty(),
897            "tokens from different formats must not match"
898        );
899    }
900
901    #[test]
902    fn identical_files_maximal_clone() {
903        // With the open_clone state machine, a single maximal clone is emitted
904        // instead of multiple sliding-window sub-clones.
905        let tokens = js_tokens_ab();
906        let file_a = make_file("a.js", "javascript", tokens.clone());
907        let file_b = make_file("b.js", "javascript", tokens);
908        let clones = detect(&[file_a, file_b], 5);
909        assert_eq!(
910            clones.len(),
911            1,
912            "open_clone SM must produce one maximal clone"
913        );
914        assert_eq!(
915            clones[0].token_count, 9,
916            "maximal clone must cover all 9 tokens"
917        );
918    }
919
920    #[test]
921    fn three_identical_files_secondary_pass_adds_missing_pair() {
922        let tokens = js_tokens_ab();
923        let file_a = make_file("a.js", "javascript", tokens.clone());
924        let file_b = make_file("b.js", "javascript", tokens.clone());
925        let file_c = make_file("c.js", "javascript", tokens);
926        let clones = detect(&[file_a, file_b, file_c], 5);
927        assert!(
928            clones.len() >= 2,
929            "three identical files must yield at least 2 clone pairs, got {}",
930            clones.len()
931        );
932    }
933
934    #[test]
935    fn clones_sorted_by_source_and_line() {
936        let tokens = js_tokens_ab();
937        let file_a = make_file("a.js", "javascript", tokens.clone());
938        let file_b = make_file("b.js", "javascript", tokens);
939        let clones = detect(&[file_a, file_b], 5);
940        for i in 1..clones.len() {
941            let prev = &clones[i - 1];
942            let curr = &clones[i];
943            assert!(
944                (
945                    &prev.fragment_a.source_id,
946                    prev.fragment_a.start.line,
947                    &prev.fragment_b.source_id,
948                    prev.fragment_b.start.line,
949                ) <= (
950                    &curr.fragment_a.source_id,
951                    curr.fragment_a.start.line,
952                    &curr.fragment_b.source_id,
953                    curr.fragment_b.start.line,
954                ),
955                "clones must be sorted"
956            );
957        }
958    }
959}
cpd_core/detect.rs

cpd_core/
detect.rs