fallow-core 2.101.0

use rustc_hash::FxHashMap;

use super::boundary::{build_boundary_prefixes, range_contains_boundary};
use super::extraction::RawGroup;
use super::{CorpusTotals, FileData, filtering, ranking, statistics};
use crate::duplicates::types::DuplicationReport;

const HASH_BASE: u64 = 0x0000_0100_0000_01b3;
const MAX_ADJACENCY_LOOKAHEAD_PAIRS: usize = 512;
const CHILD_BUCKET_PREALLOC_LIMIT: usize = 64;

#[derive(Clone, Copy)]
struct Occurrence {
    file_id: usize,
    offset: usize,
}

struct SeedClass {
    key_file_id: usize,
    key_offset: usize,
    occurrences: SeedOccurrences,
}

enum SeedOccurrences {
    One(Occurrence),
    Many(Vec<Occurrence>),
}

enum SeedBucket {
    One(SeedClass),
    Many(Vec<SeedClass>),
}

impl SeedOccurrences {
    fn push(&mut self, occurrence: Occurrence) {
        match self {
            Self::One(first) => {
                *self = Self::Many(vec![*first, occurrence]);
            }
            Self::Many(occurrences) => occurrences.push(occurrence),
        }
    }

    fn into_many(self) -> Option<Vec<Occurrence>> {
        match self {
            Self::One(_) => None,
            Self::Many(occurrences) => Some(occurrences),
        }
    }
}

impl SeedBucket {
    fn insert_or_append(
        &mut self,
        ranked_files: &[Vec<u32>],
        file_id: usize,
        offset: usize,
        min_tokens: usize,
    ) {
        match self {
            Self::One(class) => {
                if same_window(
                    ranked_files,
                    class.key_file_id,
                    class.key_offset,
                    file_id,
                    offset,
                    min_tokens,
                ) {
                    class.occurrences.push(Occurrence { file_id, offset });
                } else {
                    let first = std::mem::replace(
                        class,
                        SeedClass {
                            key_file_id: file_id,
                            key_offset: offset,
                            occurrences: SeedOccurrences::One(Occurrence { file_id, offset }),
                        },
                    );
                    *self = Self::Many(vec![
                        first,
                        SeedClass {
                            key_file_id: file_id,
                            key_offset: offset,
                            occurrences: SeedOccurrences::One(Occurrence { file_id, offset }),
                        },
                    ]);
                }
            }
            Self::Many(classes) => {
                if let Some(class) = classes.iter_mut().find(|class| {
                    same_window(
                        ranked_files,
                        class.key_file_id,
                        class.key_offset,
                        file_id,
                        offset,
                        min_tokens,
                    )
                }) {
                    class.occurrences.push(Occurrence { file_id, offset });
                } else {
                    classes.push(SeedClass {
                        key_file_id: file_id,
                        key_offset: offset,
                        occurrences: SeedOccurrences::One(Occurrence { file_id, offset }),
                    });
                }
            }
        }
    }
}

pub(super) fn detect(
    files: &[FileData],
    min_tokens: usize,
    min_lines: usize,
    skip_local: bool,
    totals: CorpusTotals,
    may_have_boundaries: bool,
) -> DuplicationReport {
    let t0 = std::time::Instant::now();
    let ranked_files = ranking::rank_reduce(files);
    let rank_time = t0.elapsed();

    let t0 = std::time::Instant::now();
    let boundary_prefixes = if may_have_boundaries {
        build_boundary_prefixes(files)
    } else {
        std::iter::repeat_with(|| None).take(files.len()).collect()
    };
    let has_boundaries = may_have_boundaries && boundary_prefixes.iter().any(Option::is_some);
    let seed_buckets = build_seed_buckets(
        &ranked_files,
        &boundary_prefixes,
        has_boundaries,
        min_tokens,
    );
    let seed_time = t0.elapsed();

    let t0 = std::time::Instant::now();
    let (raw_groups, seed_class_count) = build_raw_groups(
        &ranked_files,
        &boundary_prefixes,
        has_boundaries,
        seed_buckets,
        min_tokens,
    );
    let raw_time = t0.elapsed();
    tracing::debug!(
        elapsed_us = raw_time.as_micros(),
        raw_groups = raw_groups.len(),
        seed_classes = seed_class_count,
        "rolling_step3_raw_groups"
    );

    let t0 = std::time::Instant::now();
    let clone_groups = filtering::build_groups(raw_groups, files, min_lines, skip_local);
    let build_time = t0.elapsed();

    let t0 = std::time::Instant::now();
    let stats = statistics::compute_stats(&clone_groups, totals.files, totals.lines, totals.tokens);
    let stats_time = t0.elapsed();

    tracing::info!(
        total_us = (rank_time + seed_time + raw_time + build_time + stats_time).as_micros(),
        rank_us = rank_time.as_micros(),
        seed_us = seed_time.as_micros(),
        raw_us = raw_time.as_micros(),
        build_us = build_time.as_micros(),
        stats_us = stats_time.as_micros(),
        total_tokens = totals.tokens,
        clone_groups = clone_groups.len(),
        "rolling clone detection complete"
    );

    DuplicationReport {
        clone_groups,
        clone_families: vec![],
        mirrored_directories: vec![],
        stats,
    }
}

fn build_seed_buckets(
    ranked_files: &[Vec<u32>],
    boundary_prefixes: &[Option<Vec<u32>>],
    has_boundaries: bool,
    min_tokens: usize,
) -> FxHashMap<u64, SeedBucket> {
    let mut buckets: FxHashMap<u64, SeedBucket> = FxHashMap::default();
    let base_power = hash_base_power(min_tokens);

    for (file_id, tokens) in ranked_files.iter().enumerate() {
        if tokens.len() < min_tokens {
            continue;
        }

        let last_offset = tokens.len() - min_tokens;
        let mut hash = initial_window_hash(tokens, min_tokens);
        for offset in 0..=tokens.len() - min_tokens {
            if has_boundaries
                && range_contains_boundary(boundary_prefixes[file_id].as_ref(), offset, min_tokens)
            {
                if offset < last_offset {
                    hash = roll_window_hash(
                        hash,
                        tokens[offset],
                        tokens[offset + min_tokens],
                        base_power,
                    );
                }
                continue;
            }

            match buckets.get_mut(&hash) {
                Some(bucket) => {
                    bucket.insert_or_append(ranked_files, file_id, offset, min_tokens);
                }
                None => {
                    buckets.insert(
                        hash,
                        SeedBucket::One(SeedClass {
                            key_file_id: file_id,
                            key_offset: offset,
                            occurrences: SeedOccurrences::One(Occurrence { file_id, offset }),
                        }),
                    );
                }
            }

            if offset < last_offset {
                hash = roll_window_hash(
                    hash,
                    tokens[offset],
                    tokens[offset + min_tokens],
                    base_power,
                );
            }
        }
    }

    buckets
}

fn build_raw_groups(
    ranked_files: &[Vec<u32>],
    boundary_prefixes: &[Option<Vec<u32>>],
    has_boundaries: bool,
    seed_buckets: FxHashMap<u64, SeedBucket>,
    min_tokens: usize,
) -> (Vec<RawGroup>, usize) {
    let mut groups = Vec::new();
    let mut seed_class_count = 0_usize;

    {
        let mut ctx = SeedClassContext {
            ranked_files,
            boundary_prefixes,
            has_boundaries,
            min_tokens,
            seed_class_count: &mut seed_class_count,
            groups: &mut groups,
        };

        for bucket in seed_buckets.into_values() {
            match bucket {
                SeedBucket::One(class) => {
                    process_seed_class(&mut ctx, class);
                }
                SeedBucket::Many(classes) => {
                    for class in classes {
                        process_seed_class(&mut ctx, class);
                    }
                }
            }
        }
    }

    (groups, seed_class_count)
}

struct SeedClassContext<'a, 'b> {
    ranked_files: &'a [Vec<u32>],
    boundary_prefixes: &'a [Option<Vec<u32>>],
    has_boundaries: bool,
    min_tokens: usize,
    seed_class_count: &'b mut usize,
    groups: &'b mut Vec<RawGroup>,
}

fn process_seed_class(ctx: &mut SeedClassContext<'_, '_>, class: SeedClass) {
    let Some(occurrences) = class.occurrences.into_many() else {
        return;
    };
    *ctx.seed_class_count += 1;

    let is_left_maximal = is_left_maximal(
        ctx.ranked_files,
        ctx.boundary_prefixes,
        ctx.has_boundaries,
        &occurrences,
        ctx.min_tokens,
    );

    if is_left_maximal {
        collect_extended_groups(
            ctx.ranked_files,
            ctx.boundary_prefixes,
            ctx.has_boundaries,
            &occurrences,
            ctx.min_tokens,
            ctx.groups,
        );
    }
}

fn same_window(
    ranked_files: &[Vec<u32>],
    left_file_id: usize,
    left_offset: usize,
    right_file_id: usize,
    right_offset: usize,
    length: usize,
) -> bool {
    ranked_files[left_file_id][left_offset..left_offset + length]
        == ranked_files[right_file_id][right_offset..right_offset + length]
}

fn is_left_maximal(
    ranked_files: &[Vec<u32>],
    boundary_prefixes: &[Option<Vec<u32>>],
    has_boundaries: bool,
    occurrences: &[Occurrence],
    length: usize,
) -> bool {
    let Some(first) = occurrences.first() else {
        return false;
    };
    if first.offset == 0 {
        return true;
    }

    let previous = ranked_files[first.file_id][first.offset - 1];
    if occurrences.iter().any(|occurrence| {
        occurrence.offset == 0
            || ranked_files[occurrence.file_id][occurrence.offset - 1] != previous
            || (has_boundaries
                && range_contains_boundary(
                    boundary_prefixes[occurrence.file_id].as_ref(),
                    occurrence.offset - 1,
                    length + 1,
                ))
    }) {
        return true;
    }

    if has_same_file_pair_at_or_extending_to_adjacency(
        ranked_files,
        boundary_prefixes,
        has_boundaries,
        occurrences,
        length,
    ) {
        return true;
    }

    let current_count = count_non_overlapping_instances(occurrences, length, false);
    let extended_count = count_non_overlapping_instances(occurrences, length + 1, true);
    current_count >= 2 && extended_count < current_count
}

fn collect_extended_groups(
    ranked_files: &[Vec<u32>],
    boundary_prefixes: &[Option<Vec<u32>>],
    has_boundaries: bool,
    occurrences: &[Occurrence],
    mut length: usize,
    groups: &mut Vec<RawGroup>,
) {
    if occurrences.len() == 2 {
        let length = extend_pair_length(
            ranked_files,
            boundary_prefixes,
            has_boundaries,
            occurrences[0],
            occurrences[1],
            length,
        );
        push_raw_group(occurrences, length, groups);
        return;
    }

    length = extend_uniform_length(
        ranked_files,
        boundary_prefixes,
        has_boundaries,
        occurrences,
        length,
    );

    let children = collect_child_occurrences(
        ranked_files,
        boundary_prefixes,
        has_boundaries,
        occurrences,
        length,
    );

    push_raw_group(occurrences, length, groups);

    let Some(children) = children else {
        return;
    };

    for child in children
        .into_values()
        .filter_map(SeedOccurrences::into_many)
    {
        if is_left_maximal(
            ranked_files,
            boundary_prefixes,
            has_boundaries,
            &child,
            length + 1,
        ) {
            collect_extended_groups(
                ranked_files,
                boundary_prefixes,
                has_boundaries,
                &child,
                length + 1,
                groups,
            );
        }
    }
}

fn collect_child_occurrences(
    ranked_files: &[Vec<u32>],
    boundary_prefixes: &[Option<Vec<u32>>],
    has_boundaries: bool,
    occurrences: &[Occurrence],
    length: usize,
) -> Option<FxHashMap<u32, SeedOccurrences>> {
    let mut first_child: Option<(u32, Occurrence)> = None;
    let mut children: Option<FxHashMap<u32, SeedOccurrences>> = None;
    for &occurrence in occurrences {
        let Some(token) = child_token_after_range(
            ranked_files,
            boundary_prefixes,
            has_boundaries,
            occurrence,
            length,
        ) else {
            continue;
        };

        if let Some(children) = children.as_mut() {
            insert_child_occurrence(children, token, occurrence);
        } else if let Some((first_token, first_occurrence)) = first_child.take() {
            let mut child_map = FxHashMap::default();
            child_map.reserve(occurrences.len().min(CHILD_BUCKET_PREALLOC_LIMIT));
            insert_child_occurrence(&mut child_map, first_token, first_occurrence);
            insert_child_occurrence(&mut child_map, token, occurrence);
            children = Some(child_map);
        } else {
            first_child = Some((token, occurrence));
        }
    }

    children
}

fn child_token_after_range(
    ranked_files: &[Vec<u32>],
    boundary_prefixes: &[Option<Vec<u32>>],
    has_boundaries: bool,
    occurrence: Occurrence,
    length: usize,
) -> Option<u32> {
    let tokens = &ranked_files[occurrence.file_id];
    if occurrence.offset + length >= tokens.len()
        || (has_boundaries
            && range_contains_boundary(
                boundary_prefixes[occurrence.file_id].as_ref(),
                occurrence.offset,
                length + 1,
            ))
    {
        return None;
    }

    Some(tokens[occurrence.offset + length])
}

fn insert_child_occurrence(
    children: &mut FxHashMap<u32, SeedOccurrences>,
    token: u32,
    occurrence: Occurrence,
) {
    children
        .entry(token)
        .and_modify(|occurrences| occurrences.push(occurrence))
        .or_insert(SeedOccurrences::One(occurrence));
}

fn extend_uniform_length(
    ranked_files: &[Vec<u32>],
    boundary_prefixes: &[Option<Vec<u32>>],
    has_boundaries: bool,
    occurrences: &[Occurrence],
    mut length: usize,
) -> usize {
    let Some(first) = occurrences.first() else {
        return length;
    };

    loop {
        let Some(next) = next_extension_token(
            ranked_files,
            boundary_prefixes,
            has_boundaries,
            *first,
            length,
        ) else {
            return length;
        };

        if occurrences[1..].iter().all(|&occurrence| {
            next_extension_token(
                ranked_files,
                boundary_prefixes,
                has_boundaries,
                occurrence,
                length,
            ) == Some(next)
        }) {
            length += 1;
            continue;
        }

        return length;
    }
}

fn next_extension_token(
    ranked_files: &[Vec<u32>],
    boundary_prefixes: &[Option<Vec<u32>>],
    has_boundaries: bool,
    occurrence: Occurrence,
    length: usize,
) -> Option<u32> {
    let tokens = &ranked_files[occurrence.file_id];
    if occurrence.offset + length >= tokens.len()
        || (has_boundaries
            && range_contains_boundary(
                boundary_prefixes[occurrence.file_id].as_ref(),
                occurrence.offset,
                length + 1,
            ))
    {
        return None;
    }

    Some(tokens[occurrence.offset + length])
}

fn extend_pair_length(
    ranked_files: &[Vec<u32>],
    boundary_prefixes: &[Option<Vec<u32>>],
    has_boundaries: bool,
    left: Occurrence,
    right: Occurrence,
    length: usize,
) -> usize {
    let mut extended = length;
    let left_tokens = &ranked_files[left.file_id];
    let right_tokens = &ranked_files[right.file_id];
    let left_boundary = has_boundaries
        .then(|| boundary_prefixes[left.file_id].as_ref())
        .flatten();
    let right_boundary = has_boundaries
        .then(|| boundary_prefixes[right.file_id].as_ref())
        .flatten();

    if left_boundary.is_none() && right_boundary.is_none() {
        while left.offset + extended < left_tokens.len()
            && right.offset + extended < right_tokens.len()
            && left_tokens[left.offset + extended] == right_tokens[right.offset + extended]
        {
            extended += 1;
        }

        return extended;
    }

    while left.offset + extended < left_tokens.len()
        && right.offset + extended < right_tokens.len()
        && left_tokens[left.offset + extended] == right_tokens[right.offset + extended]
        && !range_contains_boundary(left_boundary, left.offset, extended + 1)
        && !range_contains_boundary(right_boundary, right.offset, extended + 1)
    {
        extended += 1;
    }

    extended
}

fn count_non_overlapping_instances(
    occurrences: &[Occurrence],
    length: usize,
    shift_left: bool,
) -> usize {
    debug_assert_occurrences_sorted(occurrences);

    let mut count = 0_usize;
    let mut last: Option<(usize, usize)> = None;
    for occurrence in occurrences {
        let file_id = occurrence.file_id;
        let offset = if shift_left {
            occurrence.offset - 1
        } else {
            occurrence.offset
        };
        if let Some((last_file_id, last_offset)) = last
            && file_id == last_file_id
            && offset < last_offset + length
        {
            continue;
        }
        count += 1;
        last = Some((file_id, offset));
    }

    count
}

fn has_same_file_pair_at_or_extending_to_adjacency(
    ranked_files: &[Vec<u32>],
    boundary_prefixes: &[Option<Vec<u32>>],
    has_boundaries: bool,
    occurrences: &[Occurrence],
    length: usize,
) -> bool {
    debug_assert_occurrences_sorted(occurrences);

    let mut checked_pairs = 0_usize;
    for (index, occurrence) in occurrences.iter().enumerate() {
        let file_id = occurrence.file_id;
        let left = occurrence.offset;
        for candidate in &occurrences[index + 1..] {
            let right_file_id = candidate.file_id;
            let right = candidate.offset;
            if file_id != right_file_id {
                break;
            }
            if right == left + length {
                return true;
            }
            if right < left + length {
                continue;
            }
            checked_pairs += 1;
            if checked_pairs > MAX_ADJACENCY_LOOKAHEAD_PAIRS {
                return false;
            }
            if pair_extends_to_adjacency(AdjacentPairInput {
                ranked_files,
                boundary_prefixes,
                has_boundaries,
                file_id,
                left,
                right,
                length,
            }) {
                return true;
            }
        }
    }

    false
}

#[derive(Clone, Copy)]
struct AdjacentPairInput<'a> {
    ranked_files: &'a [Vec<u32>],
    boundary_prefixes: &'a [Option<Vec<u32>>],
    has_boundaries: bool,
    file_id: usize,
    left: usize,
    right: usize,
    length: usize,
}

fn pair_extends_to_adjacency(input: AdjacentPairInput<'_>) -> bool {
    let AdjacentPairInput {
        ranked_files,
        boundary_prefixes,
        has_boundaries,
        file_id,
        left,
        right,
        length,
    } = input;
    let tokens = &ranked_files[file_id];
    let distance = right - left;
    let mut extended = length;
    let boundary = has_boundaries
        .then(|| boundary_prefixes[file_id].as_ref())
        .flatten();

    if boundary.is_none() {
        while extended < distance
            && right + extended < tokens.len()
            && tokens[left + extended] == tokens[right + extended]
        {
            extended += 1;
        }

        return extended == distance;
    }

    while extended < distance
        && right + extended < tokens.len()
        && tokens[left + extended] == tokens[right + extended]
        && !range_contains_boundary(boundary, left, extended + 1)
        && !range_contains_boundary(boundary, right, extended + 1)
    {
        extended += 1;
    }

    extended == distance
}

fn push_raw_group(occurrences: &[Occurrence], length: usize, groups: &mut Vec<RawGroup>) {
    debug_assert_occurrences_sorted(occurrences);

    let mut deduped = Vec::with_capacity(occurrences.len());
    for occurrence in occurrences {
        let file_id = occurrence.file_id;
        let offset = occurrence.offset;
        if let Some(&(last_file_id, last_offset)) = deduped.last()
            && file_id == last_file_id
            && offset < last_offset + length
        {
            continue;
        }
        deduped.push((file_id, offset));
    }

    if deduped.len() >= 2 {
        groups.push(RawGroup {
            instances: deduped,
            length,
        });
    }
}

fn debug_assert_occurrences_sorted(occurrences: &[Occurrence]) {
    debug_assert!(occurrences.windows(2).all(|pair| {
        pair[0].file_id < pair[1].file_id
            || (pair[0].file_id == pair[1].file_id && pair[0].offset <= pair[1].offset)
    }));
}

fn hash_base_power(length: usize) -> u64 {
    let mut power = 1_u64;
    for _ in 1..length {
        power = power.wrapping_mul(HASH_BASE);
    }
    power
}

fn initial_window_hash(tokens: &[u32], length: usize) -> u64 {
    let mut hash = 0_u64;
    for &token in &tokens[..length] {
        hash = hash.wrapping_mul(HASH_BASE).wrapping_add(u64::from(token));
    }
    hash
}

fn roll_window_hash(hash: u64, outgoing: u32, incoming: u32, base_power: u64) -> u64 {
    hash.wrapping_sub(u64::from(outgoing).wrapping_mul(base_power))
        .wrapping_mul(HASH_BASE)
        .wrapping_add(u64::from(incoming))
}

#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use oxc_span::Span;

    use super::*;
    use crate::duplicates::normalize::HashedToken;
    use crate::duplicates::tokenize::{FileTokens, SourceToken, TokenKind};

    #[expect(
        clippy::cast_possible_truncation,
        reason = "test span values are trivially small"
    )]
    fn make_file(path: &str, hashes: &[u64]) -> FileData {
        let source = std::iter::repeat_n("x", hashes.len())
            .collect::<Vec<_>>()
            .join("\n");
        FileData {
            path: PathBuf::from(path),
            hashed_tokens: hashes
                .iter()
                .enumerate()
                .map(|(index, &hash)| HashedToken {
                    hash,
                    original_index: index,
                })
                .collect(),
            file_tokens: FileTokens {
                tokens: (0..hashes.len())
                    .map(|index| SourceToken {
                        kind: TokenKind::Identifier(format!("t{index}")),
                        span: Span::new((index * 2) as u32, (index * 2 + 1) as u32),
                    })
                    .collect(),
                atomic_invocation_spans: Vec::new(),
                source,
                line_count: hashes.len(),
            },
            atomic_invocation_spans: Vec::new(),
        }
    }

    fn totals(files: &[FileData]) -> CorpusTotals {
        CorpusTotals {
            files: files.len(),
            lines: files.iter().map(|file| file.file_tokens.line_count).sum(),
            tokens: files.iter().map(|file| file.hashed_tokens.len()).sum(),
        }
    }

    #[test]
    fn rolling_detects_exact_duplicate_across_files() {
        let files = vec![
            make_file("a.ts", &[1, 2, 3, 4, 5]),
            make_file("b.ts", &[1, 2, 3, 4, 5]),
        ];

        let report = detect(&files, 3, 1, false, totals(&files), false);

        assert_eq!(report.clone_groups.len(), 1);
        assert_eq!(report.clone_groups[0].instances.len(), 2);
        assert_eq!(report.clone_groups[0].token_count, 5);
    }

    #[test]
    fn rolling_keeps_long_subgroup_when_seed_has_extra_occurrence() {
        let files = vec![
            make_file("a.ts", &[1, 2, 3, 4, 5, 6, 7, 8]),
            make_file("b.ts", &[1, 2, 3, 4, 5, 6, 7, 8]),
            make_file("c.ts", &[1, 2, 3, 9, 10, 11]),
        ];

        let report = detect(&files, 3, 1, false, totals(&files), false);

        assert!(
            report
                .clone_groups
                .iter()
                .any(|group| group.token_count == 8 && group.instances.len() == 2),
            "expected the long pair clone to survive alongside the broad short seed"
        );
    }

    #[test]
    fn rolling_keeps_adjacent_same_file_clone_that_extends_from_seed() {
        let files = vec![make_file("a.ts", &[5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5])];

        let report = detect(&files, 3, 1, false, totals(&files), false);

        assert_eq!(report.clone_groups.len(), 1);
        assert_eq!(report.clone_groups[0].instances.len(), 2);
        assert_eq!(report.clone_groups[0].token_count, 5);
    }

    #[test]
    fn adjacency_lookahead_checks_past_intervening_occurrences() {
        let ranked_files = vec![vec![1, 2, 1, 2, 1, 1, 2, 1, 2, 1]];
        let occurrences = vec![
            Occurrence {
                file_id: 0,
                offset: 0,
            },
            Occurrence {
                file_id: 0,
                offset: 2,
            },
            Occurrence {
                file_id: 0,
                offset: 5,
            },
        ];

        assert!(has_same_file_pair_at_or_extending_to_adjacency(
            &ranked_files,
            &[None],
            false,
            &occurrences,
            3,
        ));
    }

    #[test]
    fn rolling_extends_long_multi_occurrence_clone_iteratively() {
        let tokens: Vec<u32> = (0..20_000).collect();
        let ranked_files = vec![tokens.clone(), tokens.clone(), tokens];
        let occurrences = vec![
            Occurrence {
                file_id: 0,
                offset: 0,
            },
            Occurrence {
                file_id: 1,
                offset: 0,
            },
            Occurrence {
                file_id: 2,
                offset: 0,
            },
        ];
        let mut groups = Vec::new();

        collect_extended_groups(
            &ranked_files,
            &[None, None, None],
            false,
            &occurrences,
            3,
            &mut groups,
        );

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].length, 20_000);
        assert_eq!(groups[0].instances, vec![(0, 0), (1, 0), (2, 0)]);
    }
}