// jscpd-rs 0.1.6
//
// 50x+ faster duplicate-code detector for CI/CD; jscpd-compatible CLI, SARIF, JSON, and HTML reports.
mod apex;
mod blocks;
mod embedded;
mod generic;
mod hash;
mod ignore;
mod line_index;
mod markdown;
mod markup_attrs;
mod oxc;
mod scan;
mod tap;

use serde::Serialize;

use crate::cli::{Mode, Options};

use generic::tokenize_generic;
use hash::hash_token;
use ignore::find_ignore_regions;
use line_index::LineIndex;
use oxc::{is_oxc_format, tokenize_oxc_maps};
use scan::count_prism_whitespace_tokens;

/// Source location used in tokens, fragments, and reports.
///
/// Only `line` is one-based; `column` and `position` are zero-based.
#[derive(Clone, Debug, Serialize)]
pub struct Location {
    /// One-based line number.
    pub line: usize,
    /// Zero-based column number.
    pub column: usize,
    /// Zero-based position of the token within its source.
    ///
    /// NOTE(review): for token maps whose positions were not pre-assigned,
    /// `assign_token_positions` overwrites this field with a token ordinal,
    /// not a byte offset. Byte offsets are kept in `DetectionToken::range`.
    /// Confirm which meaning report consumers rely on.
    pub position: usize,
}

/// Detection token after mode filtering and jscpd-compatible hashing.
///
/// Tokens are created by `push_token`, which drops filtered tokens before
/// they are hashed, so every stored token carries a valid hash.
#[derive(Clone, Debug)]
pub struct DetectionToken {
    /// Stable token hash used by the duplicate detector.
    pub hash: u64,
    /// Start location of the token.
    pub start: Location,
    /// End location of the token.
    pub end: Location,
    /// Half-open `[start, end)` byte range in the original source text.
    ///
    /// `assign_token_positions` uses it to measure the gap between
    /// consecutive tokens.
    pub range: [usize; 2],
}

/// Token map for a single detected format block.
///
/// Embedded formats can produce more than one map for one source document, for
/// example script/style blocks extracted from markup-like files.
#[derive(Clone, Debug)]
pub struct TokenMap {
    /// Format name associated with this token map.
    pub format: String,
    /// Detection tokens in source order.
    pub tokens: Vec<DetectionToken>,
    /// Whether the producing tokenizer already filled in `Location::position`.
    ///
    /// When `false`, `tokenize_maps_for_detection` runs
    /// `assign_token_positions` over `tokens` before returning the map; when
    /// `true` the positions are left untouched.
    positions_assigned: bool,
}

/// Token map associated with a source identifier and line count.
///
/// Built by `Tokenizer::generate_maps`, one per `TokenMap`.
#[derive(Clone, Debug)]
pub struct SourceTokenMap {
    /// Stable source identifier, usually a file path.
    pub source_id: String,
    /// Format name associated with this token map.
    pub format: String,
    /// Detection tokens in source order.
    pub tokens: Vec<DetectionToken>,
    /// Line span covered by this map.
    ///
    /// `Tokenizer::generate_maps` computes it as the last token's end line
    /// minus the first token's start line (saturating at zero), so a map whose
    /// tokens all sit on one line, or an empty map, reports `0`.
    pub lines: usize,
}

/// Native tokenizer used by the detector.
///
/// JS/TS/JSX/TSX formats use Oxc-backed tokenization. Long-tail formats use
/// the generic native tokenizer unless a format has a dedicated implementation.
#[derive(Clone, Debug)]
pub struct Tokenizer {
    /// Options forwarded to `tokenize_maps_for_detection` on every call.
    options: Options,
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer {
    /// Create a tokenizer with default detector options.
    pub fn new() -> Self {
        Self {
            options: Options::default(),
        }
    }

    /// Create a tokenizer with caller-provided options.
    pub fn with_options(options: Options) -> Self {
        Self { options }
    }

    /// Return the options used by this tokenizer.
    pub fn options(&self) -> &Options {
        &self.options
    }

    /// Mutably access tokenizer options.
    pub fn options_mut(&mut self) -> &mut Options {
        &mut self.options
    }

    /// Tokenize a source string and return the first token stream.
    ///
    /// Use [`Tokenizer::tokenize_maps`] when a format can produce multiple
    /// embedded token maps.
    pub fn tokenize(&self, content: &str, format: &str) -> Vec<DetectionToken> {
        self.tokenize_maps(content, format)
            .into_iter()
            .next()
            .map(|map| map.tokens)
            .unwrap_or_default()
    }

    /// Tokenize source text into one or more format-specific token maps.
    pub fn tokenize_maps(&self, content: &str, format: &str) -> Vec<TokenMap> {
        tokenize_maps_for_detection(content, format, &self.options)
    }

    /// Tokenize source text and attach a source identifier to each generated map.
    pub fn generate_maps(
        &self,
        source_id: impl Into<String>,
        content: &str,
        format: &str,
    ) -> Vec<SourceTokenMap> {
        let source_id = source_id.into();
        self.tokenize_maps(content, format)
            .into_iter()
            .map(|map| SourceTokenMap {
                source_id: source_id.clone(),
                lines: token_map_line_count(&map.tokens),
                format: map.format,
                tokens: map.tokens,
            })
            .collect()
    }
}

/// Classification attached to a token before it is hashed.
///
/// `push_token` passes the kind to `hash_token` together with the token text,
/// and uses it to drop comments in `Mode::Weak`. Variants without a note below
/// are not produced anywhere in this part of the file.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum TokenKind {
    /// Comment token; skipped by `push_token` when the mode is `Mode::Weak`.
    Comment,
    Constant,
    /// Run of non-newline whitespace (possibly empty), as returned by
    /// `scan_whitespace_token`.
    Empty,
    Keyword,
    /// A single `\n`, as returned by `scan_whitespace_token`.
    NewLine,
    Number,
    Operator,
    Punctuation,
    String,
    Default,
}

/// Half-open byte range `start..end` into the source text.
#[derive(Clone, Copy)]
struct ByteSpan {
    /// Inclusive start byte offset.
    start: usize,
    /// Exclusive end byte offset.
    end: usize,
}

/// Borrowed inputs shared by the token-pushing helpers in this module.
struct TokenContext<'a> {
    /// Full source text; token spans index into it by byte offset.
    content: &'a str,
    /// Detector options; `push_token` reads `mode` and `ignore_case` from them.
    options: &'a Options,
    /// `[start, end]` byte pairs; a token whose span overlaps any pair is dropped.
    ignore_regions: &'a [[usize; 2]],
}

impl TokenContext<'_> {
    fn slice(&self, span: ByteSpan) -> &str {
        &self.content[span.start..span.end]
    }

    fn overlaps_ignore_region(&self, span: ByteSpan) -> bool {
        self.ignore_regions
            .iter()
            .any(|[region_start, region_end]| span.start < *region_end && span.end > *region_start)
    }
}

/// Test-only helper: tokenize `content` and keep just the first map's tokens.
///
/// Returns an empty vector when no token map is produced.
#[cfg(test)]
fn tokenize_for_detection(content: &str, format: &str, options: &Options) -> Vec<DetectionToken> {
    let maps = tokenize_maps_for_detection(content, format, options);
    maps.into_iter()
        .next()
        .map_or_else(Vec::new, |first| first.tokens)
}

/// Tokenize `content` into one or more token maps for `format`.
///
/// Ignore regions are computed once and handed to the selected tokenizer:
/// `markdown`, `apex` and `tap` have dedicated modules; `markup`, `vue`,
/// `svelte` and `astro` go through the block tokenizer; formats accepted by
/// `is_oxc_format` use the Oxc tokenizer; everything else falls back to the
/// generic tokenizer, which yields a single map.
///
/// Maps that did not already assign token positions get them assigned before
/// the result is returned.
pub fn tokenize_maps_for_detection(
    content: &str,
    format: &str,
    options: &Options,
) -> Vec<TokenMap> {
    let ignore_regions = find_ignore_regions(content, options);
    let mut maps = match format {
        "markdown" => markdown::tokenize_maps(content, options, &ignore_regions),
        "apex" => apex::tokenize_maps(content, options, &ignore_regions),
        "tap" => tap::tokenize_maps(content, options, &ignore_regions),
        "markup" | "vue" | "svelte" | "astro" => {
            blocks::tokenize_maps(content, format, options, &ignore_regions)
        }
        _ if is_oxc_format(format) => tokenize_oxc_maps(content, format, options, &ignore_regions),
        _ => vec![TokenMap {
            format: format.to_string(),
            tokens: tokenize_generic(content, format, options, &ignore_regions),
            positions_assigned: false,
        }],
    };
    maps.iter_mut()
        .filter(|map| !map.positions_assigned)
        .for_each(|map| assign_token_positions(content, &map.format, options, &mut map.tokens));
    maps
}

/// Line span covered by `tokens`: last token's end line minus first token's
/// start line, saturating at zero. An empty slice yields `0`.
fn token_map_line_count(tokens: &[DetectionToken]) -> usize {
    tokens
        .first()
        .zip(tokens.last())
        .map_or(0, |(first, last)| last.end.line.saturating_sub(first.start.line))
}

/// Fill in `start.position` / `end.position` for every token in `tokens`.
///
/// For the JavaScript-family formats, when report positions are needed (the
/// `json` reporter is enabled or output is not silent), each token's position
/// is its ordinal plus whatever `count_prism_whitespace_tokens` reports for the
/// gaps before it. In every other case the position is just the token's index
/// in the slice.
fn assign_token_positions(
    content: &str,
    format: &str,
    options: &Options,
    tokens: &mut [DetectionToken],
) {
    let json_requested = options.reporters.iter().any(|reporter| reporter == "json");
    let needs_report_positions = json_requested || !options.silent;
    let is_js_family = matches!(format, "javascript" | "typescript" | "jsx" | "tsx");

    if needs_report_positions && is_js_family {
        let mut next_position = 0usize;
        // Highest byte offset covered so far; tokens may overlap, so it never
        // moves backwards.
        let mut covered_until = 0usize;
        for token in tokens.iter_mut() {
            let [token_start, token_end] = token.range;
            if token_start > covered_until {
                // NOTE(review): presumably counts the whitespace tokens Prism
                // would emit in this gap; confirm against `scan`.
                next_position += count_prism_whitespace_tokens(content, covered_until, token_start);
            }
            token.start.position = next_position;
            token.end.position = next_position;
            next_position += 1;
            covered_until = covered_until.max(token_end);
        }
    } else {
        for (ordinal, token) in tokens.iter_mut().enumerate() {
            token.start.position = ordinal;
            token.end.position = ordinal;
        }
    }
}

/// Hash the text under `span` and append it to `tokens` as a detection token.
///
/// Nothing is appended when the token is a comment in `Mode::Weak`, or when
/// the span overlaps one of the context's ignore regions.
fn push_token(
    tokens: &mut Vec<DetectionToken>,
    context: &TokenContext<'_>,
    kind: TokenKind,
    span: ByteSpan,
    start: Location,
    end: Location,
) {
    let options = context.options;
    let dropped_comment = kind == TokenKind::Comment && options.mode == Mode::Weak;
    if dropped_comment || context.overlaps_ignore_region(span) {
        return;
    }

    let text = context.slice(span);
    let hash = hash_token(kind, text, options.ignore_case);
    tokens.push(DetectionToken {
        hash,
        start,
        end,
        range: [span.start, span.end],
    });
}

/// Emit whitespace tokens for `span` when running in `Mode::Strict`.
///
/// The span is split with `scan_whitespace_token`: each `\n` becomes its own
/// token and each run of other whitespace becomes one token. Every piece goes
/// through `push_token`, so ignore regions still apply. In any other mode the
/// function does nothing.
///
/// If the scanner makes no progress (the byte at `start` is not whitespace), a
/// zero-width token is still pushed as before, and scanning resumes after the
/// whole character. Stepping a single byte would land inside a multi-byte
/// character and make the next scan panic on a non-boundary slice.
fn push_strict_whitespace_tokens(
    tokens: &mut Vec<DetectionToken>,
    context: &TokenContext<'_>,
    span: ByteSpan,
    line_index: &LineIndex,
) {
    if context.options.mode != Mode::Strict {
        return;
    }
    let mut start = span.start;
    while start < span.end {
        let (end, kind) = scan_whitespace_token(context.content, start, span.end);
        push_token(
            tokens,
            context,
            kind,
            ByteSpan { start, end },
            line_index.location(start),
            line_index.location(end),
        );
        start = if end > start {
            end
        } else {
            // The scanner already sliced at `start`, so it is a char boundary
            // and this slice cannot panic.
            let step = context.content[start..]
                .chars()
                .next()
                .map_or(1, char::len_utf8);
            start + step
        };
    }
}

/// Scan one whitespace token starting at byte offset `start`.
///
/// Returns the end offset and kind of the token:
/// - a `\n` at `start` yields a one-byte `TokenKind::NewLine`;
/// - otherwise the run of non-newline whitespace beginning at `start` is
///   consumed while the cursor is below `limit`, and `TokenKind::Empty` is
///   returned. The end equals `start` when nothing was consumed.
///
/// Panics if `start` is out of bounds for `content`.
fn scan_whitespace_token(content: &str, start: usize, limit: usize) -> (usize, TokenKind) {
    if content.as_bytes()[start] == b'\n' {
        return (start + 1, TokenKind::NewLine);
    }

    let mut cursor = start;
    while cursor < limit {
        match content[cursor..].chars().next() {
            Some(ch) if ch != '\n' && ch.is_whitespace() => cursor += ch.len_utf8(),
            _ => break,
        }
    }
    (cursor, TokenKind::Empty)
}

#[cfg(test)]
mod tests;