jscpd-rs 0.1.6

50x+ faster duplicate-code detector for CI/CD; jscpd-compatible CLI, SARIF, JSON, HTML reports
Documentation
use std::sync::Arc;

use rustc_hash::FxHashMap;

use crate::cli::Options;
use crate::files::SourceFile;
use crate::tokenizer::{DetectionToken, tokenize_maps_for_detection};

use super::model::{FormatId, PreparedSourceDraft, SourceMeta, TokenSpan};

/// Maps every draft to a [`FormatId`] derived from its `meta.format` name.
///
/// Distinct format names are numbered in order of first appearance, starting
/// at zero. The returned tuple holds:
///
/// * one id per entry of `files`, in the same order, and
/// * the table of format names, where the name at index `i` belongs to
///   `FormatId(i)`.
pub(super) fn assign_formats(files: &[PreparedSourceDraft]) -> (Vec<FormatId>, Vec<String>) {
    let mut known = FxHashMap::default();
    let mut names = Vec::new();
    let mut ids = Vec::with_capacity(files.len());

    for draft in files {
        let format = &draft.meta.format;
        let id = match known.get(format) {
            Some(&existing) => existing,
            None => {
                // First sighting: the next free index becomes this format's id.
                let fresh = FormatId(names.len());
                known.insert(format.clone(), fresh);
                names.push(format.clone());
                fresh
            }
        };
        ids.push(id);
    }

    (ids, names)
}

/// Tokenizes one source file and builds a [`PreparedSourceDraft`] for each
/// token map produced by `tokenize_maps_for_detection`.
///
/// Every draft gets its own clone of the file's `source_id` and the format
/// reported by its token map, while the file content is shared between all
/// drafts through a single `Arc<str>`.
pub(super) fn prepare_file_maps(file: SourceFile, options: &Options) -> Vec<PreparedSourceDraft> {
    let source_id = file.source_id;
    let text = file.content;
    // Tokenize while the content is still borrowed, then move it into the shared Arc.
    let token_maps = tokenize_maps_for_detection(&text, &file.format, options);
    let shared_text = Arc::<str>::from(text);

    let mut drafts = Vec::new();
    for token_map in token_maps {
        let (hashes, spans) = split_tokens(token_map.tokens);
        let (lines, tokens) = token_stream_statistics(&spans);
        let meta = SourceMeta {
            source_id: source_id.clone(),
            format: token_map.format,
            lines,
            tokens,
        };
        drafts.push(PreparedSourceDraft {
            meta,
            content: Arc::clone(&shared_text),
            hashes,
            spans,
        });
    }
    drafts
}

/// Separates a token list into two parallel, reference-counted vectors: the
/// token hashes and the matching [`TokenSpan`]s (`start`, `end`, `range`).
///
/// Index `i` of both vectors refers to the `i`-th input token.
fn split_tokens(tokens: Vec<DetectionToken>) -> (Arc<Vec<u64>>, Arc<Vec<TokenSpan>>) {
    let (hashes, spans): (Vec<u64>, Vec<TokenSpan>) = tokens
        .into_iter()
        .map(|token| {
            let span = TokenSpan {
                start: token.start,
                end: token.end,
                range: token.range,
            };
            (token.hash, span)
        })
        .unzip();
    (Arc::new(hashes), Arc::new(spans))
}

/// Derives the `(lines, tokens)` statistics for a token stream.
///
/// Both values are measured from the start of the first span to the end of
/// the last span: the first is the difference of their `line` fields, the
/// second the difference of their `position` fields. Each subtraction
/// saturates at zero. An empty slice yields `(0, 0)`.
fn token_stream_statistics(spans: &[TokenSpan]) -> (usize, usize) {
    spans
        .first()
        .zip(spans.last())
        .map_or((0, 0), |(head, tail)| {
            let line_extent = tail.end.line.saturating_sub(head.start.line);
            let position_extent = tail.end.position.saturating_sub(head.start.position);
            (line_extent, position_extent)
        })
}