cpd_core/
detect.rs

1// detect.rs
2
3use rayon::prelude::*;
4use rustc_hash::{FxHashMap, FxHashSet};
5use std::path::Path;
6
7use crate::{
8    hash::{base_pow, hash_window, roll, token_hash},
9    models::{CpdClone, DetectionToken, Fragment, Location, SourceFile, TokenKind},
10};
11
12// ---------------------------------------------------------------------------
13// Internal store type — replaces the Store trait + MemoryStore
14// ---------------------------------------------------------------------------
15
16/// Window store: maps a window hash to the last seen occurrence.
17/// Type alias — no trait indirection, no vtable, no dyn dispatch.
18type WindowStore = FxHashMap<u64, Occurrence>;
19
20/// Lightweight reference to a window position within a format-group detection call.
21#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22struct Occurrence {
23    /// Index into the `prepared` array for this `detect_in_group` call.
24    source_id: usize,
25    token_start: usize,
26}
27
28// ---------------------------------------------------------------------------
29// Deduplication key
30// ---------------------------------------------------------------------------
31
32#[derive(Debug, Clone, PartialEq, Eq, Hash)]
33struct CloneDedupKey {
34    a_id: String,
35    a_start_line: u32,
36    b_id: String,
37    b_start_line: u32,
38}
39
40impl CloneDedupKey {
41    fn from_clone(c: &CpdClone) -> Self {
42        // Normalize: smaller (id, line) first so (A,B) and (B,A) map to the same key.
43        let a_key = (&c.fragment_a.source_id, c.fragment_a.start.line);
44        let b_key = (&c.fragment_b.source_id, c.fragment_b.start.line);
45        if a_key <= b_key {
46            Self {
47                a_id: c.fragment_a.source_id.clone(),
48                a_start_line: c.fragment_a.start.line,
49                b_id: c.fragment_b.source_id.clone(),
50                b_start_line: c.fragment_b.start.line,
51            }
52        } else {
53            Self {
54                a_id: c.fragment_b.source_id.clone(),
55                a_start_line: c.fragment_b.start.line,
56                b_id: c.fragment_a.source_id.clone(),
57                b_start_line: c.fragment_a.start.line,
58            }
59        }
60    }
61}
62
63// ---------------------------------------------------------------------------
64// Public API — SourceFile path (for backward compat with tests)
65// ---------------------------------------------------------------------------
66
67/// Detect duplicate code clones across `files` using a rolling-hash sliding window.
68///
69/// Files are grouped by format; each format group is processed independently.
70/// Rayon is used for outer parallelism (one task per format group).
71pub fn detect(files: &[SourceFile], min_tokens: usize) -> Vec<CpdClone> {
72    detect_with_options(files, min_tokens, false, 0)
73}
74
75/// Detect clones with extended options.
76///
77/// - `skip_local`: skip clone pairs where both fragments share the same parent directory.
78/// - `min_lines`: reject clones whose fragment line span is shorter than this.
79///   The line span is `end.line - start.line`; a clone is kept only if this value
80///   is >= `min_lines`. This mirrors jscpd's `LinesLengthCloneValidator`.
81pub fn detect_with_options(
82    files: &[SourceFile],
83    min_tokens: usize,
84    skip_local: bool,
85    min_lines: usize,
86) -> Vec<CpdClone> {
87    if files.is_empty() || min_tokens == 0 {
88        return vec![];
89    }
90
91    // Group files by format. Sort for deterministic order.
92    let mut by_format: FxHashMap<&str, Vec<&SourceFile>> = FxHashMap::default();
93    for file in files {
94        by_format.entry(file.format.as_str()).or_default().push(file);
95    }
96    let mut format_groups: Vec<(&str, Vec<&SourceFile>)> = by_format.into_iter().collect();
97    format_groups.sort_unstable_by_key(|(fmt, _)| *fmt);
98    for (_, group) in &mut format_groups {
99        group.sort_unstable_by_key(|&file| file.id.as_str());
100    }
101
102    let mut clones: Vec<CpdClone> = format_groups
103        .into_par_iter()
104        .flat_map(|(_format, files)| {
105            // Build per-group prepared data from SourceFile.tokens.
106            // This is the backward-compat path; orchestrate.rs uses
107            // detect_prepared() directly to avoid re-hashing.
108            let prepared: Vec<PreparedSource> = files
109                .into_iter()
110                .map(|file| {
111                    let mut hashes = Vec::with_capacity(file.tokens.len());
112                    let mut spans: Vec<(Location, Location)> =
113                        Vec::with_capacity(file.tokens.len());
114                    for t in &file.tokens {
115                        if t.kind == TokenKind::Ignore {
116                            continue;
117                        }
118                        hashes.push(token_hash(t.kind.discriminant(), &t.value));
119                        spans.push((t.start.clone(), t.end.clone()));
120                    }
121                    PreparedSource {
122                        id: file.id.clone(),
123                        format: file.format.clone(),
124                        hashes,
125                        spans,
126                    }
127                })
128                .collect();
129            detect_in_group(&prepared, min_tokens, skip_local, min_lines)
130        })
131        .collect();
132
133    dedup_exact_clones(&mut clones);
134
135    clones.sort_by(|a, b| {
136        (
137            &a.fragment_a.source_id,
138            a.fragment_a.start.line,
139            &a.fragment_b.source_id,
140            a.fragment_b.start.line,
141        )
142            .cmp(&(
143                &b.fragment_a.source_id,
144                b.fragment_a.start.line,
145                &b.fragment_b.source_id,
146                b.fragment_b.start.line,
147            ))
148    });
149
150    clones
151}
152
153// ---------------------------------------------------------------------------
154// Direct DetectionToken path (called by orchestrate.rs)
155// ---------------------------------------------------------------------------
156
157/// A file ready for detection: pre-hashed, pre-filtered.
158///
159/// Produced either from `SourceFile.tokens` (backward compat) or directly from
160/// `tokenize_to_detection` output (fast path used by orchestrate.rs).
161pub struct PreparedSource {
162    pub id: String,
163    pub format: String,
164    pub hashes: Vec<u64>,
165    pub spans: Vec<(Location, Location)>,
166}
167
168impl PreparedSource {
169    /// Build from a `DetectionToken` slice — the fast path.
170    pub fn from_detection_tokens(id: String, format: String, tokens: &[DetectionToken]) -> Self {
171        let mut hashes = Vec::with_capacity(tokens.len());
172        let mut spans = Vec::with_capacity(tokens.len());
173        for t in tokens {
174            hashes.push(t.hash);
175            spans.push((t.start.clone(), t.end.clone()));
176        }
177        Self {
178            id,
179            format,
180            hashes,
181            spans,
182        }
183    }
184}
185
186/// Detect clones from pre-prepared sources grouped by format.
187///
188/// Called by orchestrate.rs after `tokenize_to_detection` — skips re-hashing.
189pub fn detect_prepared(
190    format_groups: Vec<Vec<PreparedSource>>,
191    min_tokens: usize,
192    skip_local: bool,
193    min_lines: usize,
194) -> Vec<CpdClone> {
195    if format_groups.is_empty() || min_tokens == 0 {
196        return vec![];
197    }
198
199    let mut clones: Vec<CpdClone> = format_groups
200        .into_par_iter()
201        .flat_map(|group| detect_in_group(&group, min_tokens, skip_local, min_lines))
202        .collect();
203
204    dedup_exact_clones(&mut clones);
205
206    clones.sort_by(|a, b| {
207        (
208            &a.fragment_a.source_id,
209            a.fragment_a.start.line,
210            &a.fragment_b.source_id,
211            a.fragment_b.start.line,
212        )
213            .cmp(&(
214                &b.fragment_a.source_id,
215                b.fragment_a.start.line,
216                &b.fragment_b.source_id,
217                b.fragment_b.start.line,
218            ))
219    });
220
221    clones
222}
223
224// ---------------------------------------------------------------------------
225// Core detection — per format group
226// ---------------------------------------------------------------------------
227
228fn detect_in_group(
229    prepared: &[PreparedSource],
230    min_tokens: usize,
231    skip_local: bool,
232    min_lines: usize,
233) -> Vec<CpdClone> {
234    // Precompute window_power once for this format group.
235    // If per-language min_tokens is introduced, recompute per group (it is already scoped here).
236    let window_power = base_pow(min_tokens.saturating_sub(1));
237
238    // Pre-allocate store capacity to avoid FxHashMap rehashing.
239    let total_windows: usize = prepared
240        .iter()
241        .map(|p| p.hashes.len().saturating_sub(min_tokens))
242        .sum();
243    let mut store: WindowStore =
244        FxHashMap::with_capacity_and_hasher(total_windows, Default::default());
245
246    let mut clones: Vec<CpdClone> = Vec::new();
247    // Cap repeated-window occurrences per hash. Higher values find more clone pairs
248    // among 3+ similar files (e.g., file_1.js, file_1.mjs, file_1.cjs) but use more memory.
249    // The TypeScript jscpd compares all file pairs, so we raise this to match its coverage.
250    const SECONDARY_OCCURRENCE_CAP: usize = 2;
251    let mut repeated_windows: FxHashMap<u64, Vec<Occurrence>> = FxHashMap::default();
252
253    for (file_idx, source) in prepared.iter().enumerate() {
254        let hashes = &source.hashes;
255        if hashes.len() < min_tokens {
256            continue;
257        }
258        let windows_len = hashes.len() - min_tokens + 1;
259
260        // open_clone state machine: replaces emit-every-window + suppress_subclones.
261        // A clone is opened when a matching window is found and enlarged as long as
262        // subsequent windows also match. flush_clone is called when the match breaks
263        // or the file scan ends — only one clone per contiguous matching region.
264        let mut open_clone: Option<OpenClone> = None;
265
266        let mut window_hash = hash_window(&hashes[..min_tokens]);
267
268        for token_start in 0..windows_len {
269            if token_start > 0 {
270                window_hash = roll(
271                    window_hash,
272                    hashes[token_start - 1],
273                    hashes[token_start + min_tokens - 1],
274                    window_power,
275                );
276            }
277
278            let current = Occurrence {
279                source_id: file_idx,
280                token_start,
281            };
282
283            match store.get(&window_hash).copied() {
284                Some(stored) if windows_match(stored, current, prepared, min_tokens) => {
285                    if open_clone.is_none() {
286                        open_clone = Some(OpenClone {
287                            stored_occurrence: stored,
288                            current_start: token_start,
289                            match_len: min_tokens,
290                        });
291                    } else if let Some(ref mut oc) = open_clone {
292                        // Enlarge: the next window also matches — extend by one token.
293                        oc.match_len += 1;
294                    }
295                    remember_repeated_window(
296                        &mut repeated_windows,
297                        window_hash,
298                        stored,
299                        SECONDARY_OCCURRENCE_CAP,
300                    );
301                    remember_repeated_window(
302                        &mut repeated_windows,
303                        window_hash,
304                        current,
305                        SECONDARY_OCCURRENCE_CAP,
306                    );
307                    // Do NOT update store — keep the first occurrence so the enlargement
308                    // stays consistent across the contiguous match region.
309                }
310                _ => {
311                    // Match broke (or no entry). Flush whatever was open.
312                    flush_clone(
313                        open_clone.take(),
314                        file_idx,
315                        prepared,
316                        skip_local,
317                        min_lines,
318                        &mut clones,
319                    );
320                    store.insert(window_hash, current);
321                }
322            }
323        }
324
325        // Flush any open clone at the end of the file scan.
326        flush_clone(
327            open_clone.take(),
328            file_idx,
329            prepared,
330            skip_local,
331            min_lines,
332            &mut clones,
333        );
334    }
335
336    add_secondary_clones(
337        repeated_windows,
338        prepared,
339        min_tokens,
340        skip_local,
341        min_lines,
342        &mut clones,
343    );
344
345    clones
346}
347
348// ---------------------------------------------------------------------------
349// Open clone state machine helpers
350// ---------------------------------------------------------------------------
351
352struct OpenClone {
353    stored_occurrence: Occurrence,
354    current_start: usize,
355    match_len: usize,
356}
357
358/// Returns true if the window at `current` actually matches the window at `stored`
359/// (hash match is necessary but not sufficient — verify token equality).
360fn windows_match(
361    stored: Occurrence,
362    current: Occurrence,
363    prepared: &[PreparedSource],
364    min_tokens: usize,
365) -> bool {
366    if stored.source_id == current.source_id && stored.token_start == current.token_start {
367        return false;
368    }
369    let stored_hashes = &prepared[stored.source_id].hashes;
370    let current_hashes = &prepared[current.source_id].hashes;
371    if stored.token_start + min_tokens > stored_hashes.len()
372        || current.token_start + min_tokens > current_hashes.len()
373    {
374        return false;
375    }
376    stored_hashes[stored.token_start..stored.token_start + min_tokens]
377        == current_hashes[current.token_start..current.token_start + min_tokens]
378}
379
380/// Flush an open clone to the clones list.
381///
382/// A clone is rejected if its line span is shorter than `min_lines`.
383/// The line span is measured as `end.line - start.line` (which equals
384/// `number_of_lines - 1`). Mirrors jscpd's `LinesLengthCloneValidator`.
385fn flush_clone(
386    open: Option<OpenClone>,
387    current_file_idx: usize,
388    prepared: &[PreparedSource],
389    skip_local: bool,
390    min_lines: usize,
391    clones: &mut Vec<CpdClone>,
392) {
393    let oc = match open {
394        Some(o) => o,
395        None => return,
396    };
397
398    let existing = &oc.stored_occurrence;
399    let cur_start = oc.current_start;
400    let match_len = oc.match_len;
401
402    let existing_file = &prepared[existing.source_id];
403    let current_file = &prepared[current_file_idx];
404
405    let ex_start = existing.token_start;
406    let ex_end = ex_start + match_len - 1;
407    let cur_end = cur_start + match_len - 1;
408
409    // skip_local: drop clone pairs where both fragments share the same parent directory.
410    if skip_local {
411        let dir_a = Path::new(&existing_file.id).parent();
412        let dir_b = Path::new(&current_file.id).parent();
413        if dir_a == dir_b {
414            return;
415        }
416    }
417
418    let fragment_a = match make_fragment(&existing_file.id, &existing_file.spans, ex_start, ex_end)
419    {
420        Some(f) => f,
421        None => return,
422    };
423    let fragment_b = match make_fragment(&current_file.id, &current_file.spans, cur_start, cur_end)
424    {
425        Some(f) => f,
426        None => return,
427    };
428
429    // min_lines filter: reject clones whose fragment A line span is shorter than min_lines.
430    // Mirrors jscpd's LinesLengthCloneValidator which checks only duplicationA:
431    //   duplicationA.end.line - duplicationA.start.line >= minLines
432    if min_lines > 0 {
433        let lines = fragment_a.end.line as usize - fragment_a.start.line as usize;
434        if lines < min_lines {
435            return;
436        }
437    }
438
439    clones.push(CpdClone {
440        format: current_file.format.clone(),
441        fragment_a,
442        fragment_b,
443        token_count: match_len as u32,
444    });
445}
446
447fn make_fragment(
448    source_id: &str,
449    spans: &[(Location, Location)],
450    start_idx: usize,
451    end_idx: usize,
452) -> Option<Fragment> {
453    let (first_start, _) = spans.get(start_idx)?;
454    let (_, last_end) = spans.get(end_idx)?;
455    Some(Fragment {
456        source_id: source_id.to_string(),
457        start: first_start.clone(),
458        end: last_end.clone(),
459        range: [start_idx as u32, end_idx as u32],
460        blame: None,
461    })
462}
463
464// ---------------------------------------------------------------------------
465// Deduplication — O(n) FxHashSet + sub-clone suppression
466// ---------------------------------------------------------------------------
467
468fn dedup_exact_clones(clones: &mut Vec<CpdClone>) {
469    // Normalize each clone so fragment_a <= fragment_b (by id then start line).
470    for clone in clones.iter_mut() {
471        let a_key = (&clone.fragment_a.source_id, clone.fragment_a.start.line);
472        let b_key = (&clone.fragment_b.source_id, clone.fragment_b.start.line);
473        if a_key > b_key {
474            std::mem::swap(&mut clone.fragment_a, &mut clone.fragment_b);
475        }
476    }
477
478    let mut seen: FxHashSet<CloneDedupKey> = FxHashSet::default();
479    clones.retain(|c| seen.insert(CloneDedupKey::from_clone(c)));
480}
481
482// ---------------------------------------------------------------------------
483// Secondary clone pass
484// ---------------------------------------------------------------------------
485
486fn remember_repeated_window(
487    repeated_windows: &mut FxHashMap<u64, Vec<Occurrence>>,
488    hash: u64,
489    occurrence: Occurrence,
490    cap: usize,
491) {
492    let bucket = repeated_windows.entry(hash).or_default();
493    if bucket
494        .iter()
495        .any(|s| s.source_id == occurrence.source_id && s.token_start == occurrence.token_start)
496    {
497        return;
498    }
499    if bucket.len() < cap {
500        bucket.push(occurrence);
501    }
502}
503
504struct SecondaryOpen {
505    clone: CpdClone,
506    source_a: usize,
507    source_b: usize,
508    last_token_start_a: usize,
509    last_token_start_b: usize,
510}
511
512fn add_secondary_clones(
513    repeated_windows: FxHashMap<u64, Vec<Occurrence>>,
514    prepared: &[PreparedSource],
515    min_tokens: usize,
516    skip_local: bool,
517    min_lines: usize,
518    clones: &mut Vec<CpdClone>,
519) {
520    if repeated_windows.is_empty() {
521        return;
522    }
523
524    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
525    struct Candidate {
526        source_a: usize,
527        source_b: usize,
528        token_a: usize,
529        token_b: usize,
530    }
531
532    let mut candidates: Vec<Candidate> = Vec::new();
533    for occurrences in repeated_windows.values() {
534        if occurrences.len() < 2 {
535            continue;
536        }
537        for li in 0..occurrences.len() {
538            for ri in li + 1..occurrences.len() {
539                let left = &occurrences[li];
540                let right = &occurrences[ri];
541                if left.source_id == right.source_id && left.token_start == right.token_start {
542                    continue;
543                }
544                let lh = &prepared[left.source_id].hashes;
545                let rh = &prepared[right.source_id].hashes;
546                let la = left.token_start;
547                let ra = right.token_start;
548                if la + min_tokens > lh.len() || ra + min_tokens > rh.len() {
549                    continue;
550                }
551                if lh[la..la + min_tokens] != rh[ra..ra + min_tokens] {
552                    continue;
553                }
554                let (sa, ta, sb, tb) =
555                    if (left.source_id, left.token_start) <= (right.source_id, right.token_start) {
556                        (
557                            left.source_id,
558                            left.token_start,
559                            right.source_id,
560                            right.token_start,
561                        )
562                    } else {
563                        (
564                            right.source_id,
565                            right.token_start,
566                            left.source_id,
567                            left.token_start,
568                        )
569                    };
570                candidates.push(Candidate {
571                    source_a: sa,
572                    source_b: sb,
573                    token_a: ta,
574                    token_b: tb,
575                });
576            }
577        }
578    }
579    if candidates.is_empty() {
580        return;
581    }
582    candidates.sort_unstable();
583    candidates.dedup();
584
585    // Build line-coverage from already-found primary clones.
586    let mut coverage = LineCoverage::from_clones(prepared, clones);
587    let mut open: Option<SecondaryOpen> = None;
588
589    for candidate in candidates {
590        if let Some(current) = open.as_mut()
591            && current.source_a == candidate.source_a
592            && current.source_b == candidate.source_b
593            && current.last_token_start_a + 1 == candidate.token_a
594            && current.last_token_start_b + 1 == candidate.token_b
595        {
596            // Enlarge: extend the open secondary clone by one token on each side.
597            let new_match_len = current.clone.token_count as usize + 1;
598            let end_idx_a = candidate.token_a + min_tokens;
599            let end_idx_b = candidate.token_b + min_tokens;
600            if let Some(frag_a_end) = prepared[current.source_a].spans.get(end_idx_a) {
601                current.clone.fragment_a.end = frag_a_end.1.clone();
602                current.clone.fragment_a.range[1] = end_idx_a as u32;
603            }
604            if let Some(frag_b_end) = prepared[current.source_b].spans.get(end_idx_b) {
605                current.clone.fragment_b.end = frag_b_end.1.clone();
606                current.clone.fragment_b.range[1] = end_idx_b as u32;
607            }
608            current.clone.token_count = new_match_len as u32;
609            current.last_token_start_a = candidate.token_a;
610            current.last_token_start_b = candidate.token_b;
611            continue;
612        }
613
614        flush_secondary_clone(
615            open.take(),
616            prepared,
617            skip_local,
618            min_lines,
619            clones,
620            &mut coverage,
621        );
622
623        // Create a new secondary clone candidate.
624        let start_a = candidate.token_a;
625        let end_a = start_a + min_tokens - 1;
626        let start_b = candidate.token_b;
627        let end_b = start_b + min_tokens - 1;
628
629        let frag_a = match make_fragment(
630            &prepared[candidate.source_a].id,
631            &prepared[candidate.source_a].spans,
632            start_a,
633            end_a,
634        ) {
635            Some(f) => f,
636            None => continue,
637        };
638        let frag_b = match make_fragment(
639            &prepared[candidate.source_b].id,
640            &prepared[candidate.source_b].spans,
641            start_b,
642            end_b,
643        ) {
644            Some(f) => f,
645            None => continue,
646        };
647
648        open = Some(SecondaryOpen {
649            clone: CpdClone {
650                format: prepared[candidate.source_a].format.clone(),
651                fragment_a: frag_a,
652                fragment_b: frag_b,
653                token_count: min_tokens as u32,
654            },
655            source_a: candidate.source_a,
656            source_b: candidate.source_b,
657            last_token_start_a: candidate.token_a,
658            last_token_start_b: candidate.token_b,
659        });
660    }
661
662    flush_secondary_clone(
663        open.take(),
664        prepared,
665        skip_local,
666        min_lines,
667        clones,
668        &mut coverage,
669    );
670}
671
672fn flush_secondary_clone(
673    open: Option<SecondaryOpen>,
674    prepared: &[PreparedSource],
675    skip_local: bool,
676    min_lines: usize,
677    clones: &mut Vec<CpdClone>,
678    coverage: &mut LineCoverage,
679) {
680    let Some(oc) = open else {
681        return;
682    };
683
684    let range_a = fragment_line_range(&oc.clone.fragment_a);
685    let range_b = fragment_line_range(&oc.clone.fragment_b);
686
687    // skip_local check
688    if skip_local {
689        let dir_a = Path::new(&prepared[oc.source_a].id).parent();
690        let dir_b = Path::new(&prepared[oc.source_b].id).parent();
691        if dir_a == dir_b {
692            return;
693        }
694    }
695
696    // min_lines filter: only check fragment A, mirroring jscpd's LinesLengthCloneValidator.
697    if min_lines > 0 {
698        let lines = oc.clone.fragment_a.end.line as usize - oc.clone.fragment_a.start.line as usize;
699        if lines < min_lines {
700            return;
701        }
702    }
703
704    // Line-coverage filter: skip secondary clones that don't extend existing coverage
705    // on either side.  This prevents the report from filling up with dozens of
706    // overlapping sub-clones of the same region.
707    if !coverage.extends(oc.source_a, range_a) || !coverage.extends(oc.source_b, range_b) {
708        return;
709    }
710
711    let before = clones.len();
712    clones.push(oc.clone);
713
714    // Insert coverage for newly added clone.
715    if clones.len() > before {
716        coverage.insert(oc.source_a, range_a);
717        coverage.insert(oc.source_b, range_b);
718    }
719}
720
721fn fragment_line_range(fragment: &Fragment) -> (usize, usize) {
722    let start = fragment.start.line as usize;
723    let end = fragment.end.line as usize;
724    (start.min(end), start.max(end))
725}
726
727// ---------------------------------------------------------------------------
728// Line coverage tracking for secondary clones
729// ---------------------------------------------------------------------------
730
731struct LineCoverage {
732    ranges_by_source: Vec<Vec<(usize, usize)>>,
733}
734
735impl LineCoverage {
736    fn from_clones(prepared: &[PreparedSource], clones: &[CpdClone]) -> Self {
737        let mut source_lookup: FxHashMap<&str, usize> = FxHashMap::default();
738        for (idx, source) in prepared.iter().enumerate() {
739            source_lookup.insert(source.id.as_str(), idx);
740        }
741        let mut coverage = Self {
742            ranges_by_source: vec![Vec::new(); prepared.len()],
743        };
744        for clone in clones {
745            if let Some(idx) = source_lookup.get(clone.fragment_a.source_id.as_str()) {
746                coverage.insert(*idx, fragment_line_range(&clone.fragment_a));
747            }
748            if let Some(idx) = source_lookup.get(clone.fragment_b.source_id.as_str()) {
749                coverage.insert(*idx, fragment_line_range(&clone.fragment_b));
750            }
751        }
752        coverage
753    }
754
755    fn extends(&self, source_idx: usize, range: (usize, usize)) -> bool {
756        let Some(ranges) = self.ranges_by_source.get(source_idx) else {
757            return true;
758        };
759        let mut next_line = range.0;
760        for &(start, end) in ranges {
761            if end < next_line {
762                continue;
763            }
764            if start > next_line {
765                return true;
766            }
767            next_line = next_line.max(end.saturating_add(1));
768            if next_line > range.1 {
769                return false;
770            }
771        }
772        next_line <= range.1
773    }
774
775    fn insert(&mut self, source_idx: usize, range: (usize, usize)) {
776        let Some(ranges) = self.ranges_by_source.get_mut(source_idx) else {
777            return;
778        };
779        ranges.push(range);
780        ranges.sort_unstable();
781
782        let mut merged: Vec<(usize, usize)> = Vec::with_capacity(ranges.len());
783        for &(start, end) in ranges.iter() {
784            if let Some((_, previous_end)) = merged.last_mut()
785                && start <= previous_end.saturating_add(1)
786            {
787                *previous_end = (*previous_end).max(end);
788                continue;
789            }
790            merged.push((start, end));
791        }
792        *ranges = merged;
793    }
794}
795
796// ---------------------------------------------------------------------------
797// Tests
798// ---------------------------------------------------------------------------
799
800#[cfg(test)]
801mod tests {
802    use super::*;
803    use crate::models::{Location, Token, TokenKind};
804
805    fn loc(line: u32, col: u32, offset: u32) -> Location {
806        Location {
807            line,
808            column: col,
809            offset,
810        }
811    }
812
813    fn make_token(kind: TokenKind, value: &str, line: u32, col: u32, offset: u32) -> Token {
814        let end_col = col + value.len() as u32;
815        let end_off = offset + value.len() as u32;
816        Token {
817            kind,
818            value: value.to_string(),
819            start: loc(line, col, offset),
820            end: loc(line, end_col, end_off),
821        }
822    }
823
824    fn make_file(id: &str, format: &str, tokens: Vec<Token>) -> SourceFile {
825        SourceFile {
826            id: id.to_string(),
827            format: format.to_string(),
828            tokens,
829        }
830    }
831
832    fn js_tokens_ab() -> Vec<Token> {
833        vec![
834            make_token(TokenKind::Keyword, "function", 1, 0, 0),
835            make_token(TokenKind::Other, "hello", 1, 9, 9),
836            make_token(TokenKind::Operator, "(", 1, 14, 14),
837            make_token(TokenKind::Operator, ")", 1, 15, 15),
838            make_token(TokenKind::Operator, "{", 1, 16, 16),
839            make_token(TokenKind::Keyword, "return", 2, 0, 18),
840            make_token(TokenKind::Literal, "42", 2, 7, 25),
841            make_token(TokenKind::Operator, ";", 2, 9, 27),
842            make_token(TokenKind::Operator, "}", 3, 0, 29),
843        ]
844    }
845
846    #[test]
847    fn empty_input_returns_empty() {
848        let result = detect(&[], 10);
849        assert!(result.is_empty());
850    }
851
852    #[test]
853    fn identical_files_detected_as_clone() {
854        let tokens = js_tokens_ab();
855        let file_a = make_file("a.js", "javascript", tokens.clone());
856        let file_b = make_file("b.js", "javascript", tokens);
857        let clones = detect(&[file_a, file_b], 5);
858        assert!(
859            !clones.is_empty(),
860            "identical files must produce at least one clone"
861        );
862    }
863
864    #[test]
865    fn min_tokens_threshold_respected() {
866        let tokens = js_tokens_ab(); // 9 tokens
867        let file_a = make_file("a.js", "javascript", tokens.clone());
868        let file_b = make_file("b.js", "javascript", tokens);
869        let clones = detect(&[file_a, file_b], 100);
870        assert!(
871            clones.is_empty(),
872            "no clones when min_tokens exceeds file length"
873        );
874    }
875
876    #[test]
877    fn deduplication_ab_ba_collapse() {
878        let tokens = js_tokens_ab();
879        let file_a = make_file("a.js", "javascript", tokens.clone());
880        let file_b = make_file("b.js", "javascript", tokens);
881        let clones = detect(&[file_a, file_b], 5);
882        // Should get exactly 1 clone pair, not 2 symmetric ones.
883        assert_eq!(clones.len(), 1, "symmetric pairs must collapse to 1");
884    }
885
886    #[test]
887    fn different_formats_not_cross_detected() {
888        let tokens = js_tokens_ab();
889        let file_js = make_file("a.js", "javascript", tokens.clone());
890        let file_py = make_file("a.py", "python", tokens);
891        let clones = detect(&[file_js, file_py], 5);
892        assert!(
893            clones.is_empty(),
894            "tokens from different formats must not match"
895        );
896    }
897
898    #[test]
899    fn identical_files_maximal_clone() {
900        // With the open_clone state machine, a single maximal clone is emitted
901        // instead of multiple sliding-window sub-clones.
902        let tokens = js_tokens_ab();
903        let file_a = make_file("a.js", "javascript", tokens.clone());
904        let file_b = make_file("b.js", "javascript", tokens);
905        let clones = detect(&[file_a, file_b], 5);
906        assert_eq!(
907            clones.len(),
908            1,
909            "open_clone SM must produce one maximal clone"
910        );
911        assert_eq!(
912            clones[0].token_count, 9,
913            "maximal clone must cover all 9 tokens"
914        );
915    }
916
917    #[test]
918    fn three_identical_files_secondary_pass_adds_missing_pair() {
919        let tokens = js_tokens_ab();
920        let file_a = make_file("a.js", "javascript", tokens.clone());
921        let file_b = make_file("b.js", "javascript", tokens.clone());
922        let file_c = make_file("c.js", "javascript", tokens);
923        let clones = detect(&[file_a, file_b, file_c], 5);
924        assert!(
925            clones.len() >= 2,
926            "three identical files must yield at least 2 clone pairs, got {}",
927            clones.len()
928        );
929    }
930
931    #[test]
932    fn clones_sorted_by_source_and_line() {
933        let tokens = js_tokens_ab();
934        let file_a = make_file("a.js", "javascript", tokens.clone());
935        let file_b = make_file("b.js", "javascript", tokens);
936        let clones = detect(&[file_a, file_b], 5);
937        for i in 1..clones.len() {
938            let prev = &clones[i - 1];
939            let curr = &clones[i];
940            assert!(
941                (
942                    &prev.fragment_a.source_id,
943                    prev.fragment_a.start.line,
944                    &prev.fragment_b.source_id,
945                    prev.fragment_b.start.line,
946                ) <= (
947                    &curr.fragment_a.source_id,
948                    curr.fragment_a.start.line,
949                    &curr.fragment_b.source_id,
950                    curr.fragment_b.start.line,
951                ),
952                "clones must be sorted"
953            );
954        }
955    }
956}
cpd_core/detect.rs

cpd_core/
detect.rs