jscpd-rs 0.1.1

50x+ faster duplicate-code detector for CI/CD; jscpd-compatible CLI, SARIF, JSON, HTML reports
Documentation
use rustc_hash::FxHashMap;

use crate::cli::Options;
use crate::files::SourceFile;
use crate::tokenizer::{DetectionToken, tokenize_maps_for_detection};

use super::model::{FormatId, PreparedSourceDraft, SourceMeta, TokenSpan};

pub(super) fn assign_formats(files: &[PreparedSourceDraft]) -> (Vec<FormatId>, Vec<String>) {
    let mut by_name = FxHashMap::default();
    let mut names = Vec::new();
    let ids = files
        .iter()
        .map(|file| {
            if let Some(id) = by_name.get(&file.meta.format) {
                *id
            } else {
                let id = FormatId(names.len());
                by_name.insert(file.meta.format.clone(), id);
                names.push(file.meta.format.clone());
                id
            }
        })
        .collect();
    (ids, names)
}

pub(super) fn prepare_file_maps(file: SourceFile, options: &Options) -> Vec<PreparedSourceDraft> {
    tokenize_maps_for_detection(&file.content, &file.format, options)
        .into_iter()
        .map(|map| {
            let (hashes, spans) = split_tokens(map.tokens);
            let (stat_lines, stat_tokens) = token_stream_statistics(&spans);
            PreparedSourceDraft {
                meta: SourceMeta {
                    source_id: file.source_id.clone(),
                    format: map.format,
                    content: file.content.clone(),
                    lines: stat_lines,
                    tokens: stat_tokens,
                },
                hashes,
                spans,
            }
        })
        .collect()
}

fn split_tokens(tokens: Vec<DetectionToken>) -> (Vec<u64>, Vec<TokenSpan>) {
    let mut hashes = Vec::with_capacity(tokens.len());
    let mut spans = Vec::with_capacity(tokens.len());
    for token in tokens {
        hashes.push(token.hash);
        spans.push(TokenSpan {
            start: token.start,
            end: token.end,
            range: token.range,
        });
    }
    (hashes, spans)
}

fn token_stream_statistics(spans: &[TokenSpan]) -> (usize, usize) {
    match (spans.first(), spans.last()) {
        (Some(first), Some(last)) => (
            last.end.line.saturating_sub(first.start.line),
            last.end.position.saturating_sub(first.start.position),
        ),
        _ => (0, 0),
    }
}