// jscpd-rs 0.1.6 - Docs.rs

use crate::cli::{Mode, Options};

use super::scan::scan_block_comment;
use super::{
    ByteSpan, DetectionToken, LineIndex, TokenContext, TokenKind, push_strict_whitespace_tokens,
    push_token,
};

/// Tokenizes `content` with the format-agnostic scanner and returns the
/// collected detection tokens.
///
/// The input is walked left to right. At each position one of two things
/// happens:
///
/// * A whitespace run is measured with `scan_whitespace`. In `Mode::Strict`
///   the run is handed to `push_strict_whitespace_tokens`. Otherwise it is
///   emitted as a `TokenKind::Default` token only when `format` is `"twig"`
///   and `twig_keeps_mild_whitespace` accepts it; in every other case the run
///   is skipped without producing a token.
/// * Any other position is classified by the first rule that applies, in this
///   order: a format-specific multi-line span, a comment span, a quoted YAML
///   string, a punctuation-split token, and finally a plain run of
///   non-whitespace characters.
///
/// `options` and `ignore_regions` are forwarded to the push helpers through a
/// `TokenContext`.
pub(super) fn tokenize_generic(
    content: &str,
    format: &str,
    options: &Options,
    ignore_regions: &[[usize; 2]],
) -> Vec<DetectionToken> {
    let context = TokenContext {
        content,
        options,
        ignore_regions,
    };
    let line_index = LineIndex::new(content);
    let total_len = content.len();
    let mut tokens = Vec::new();
    let mut cursor = 0usize;

    while cursor < total_len {
        let Some(lead) = content[cursor..].chars().next() else {
            break;
        };
        // Whatever the scanners report, always step past the leading
        // character so the loop cannot stall on a zero-length span.
        let min_next = cursor + lead.len_utf8();

        if lead.is_whitespace() {
            let run_end = scan_whitespace(content, cursor);
            if options.mode == Mode::Strict {
                let span = ByteSpan {
                    start: cursor,
                    end: run_end,
                };
                push_strict_whitespace_tokens(&mut tokens, &context, span, &line_index);
            } else if format == "twig" && twig_keeps_mild_whitespace(content, cursor, run_end) {
                // Prism's Twig grammar labels these spans as `default`, so
                // upstream mild mode keeps them while filtering empty/new_line.
                let span = ByteSpan {
                    start: cursor,
                    end: run_end,
                };
                push_token(
                    &mut tokens,
                    &context,
                    TokenKind::Default,
                    span,
                    line_index.location(cursor),
                    line_index.location(run_end),
                );
            }
            cursor = run_end.max(min_next);
            continue;
        }

        let (token_end, kind) = generic_multiline_span_end(content, format, cursor, total_len)
            .or_else(|| {
                generic_comment_span_end(content, format, cursor, total_len)
                    .map(|comment_end| (comment_end, TokenKind::Comment))
            })
            .unwrap_or_else(|| {
                if format == "yaml" && (lead == '"' || lead == '\'') {
                    (scan_quoted_string(content, cursor), TokenKind::String)
                } else if punctuation_split_format(format) {
                    scan_punctuation_split_token(content, format, cursor)
                } else {
                    (scan_generic_token(content, cursor), TokenKind::Default)
                }
            });

        let span = ByteSpan {
            start: cursor,
            end: token_end,
        };
        push_token(
            &mut tokens,
            &context,
            kind,
            span,
            line_index.location(cursor),
            line_index.location(token_end),
        );
        cursor = token_end.max(min_next);
    }

    tokens
}

/// Returns the byte offset just past the run of non-whitespace characters that
/// begins at `start`.
///
/// The run ends at the first character for which `char::is_whitespace` holds,
/// or at the end of `content`. When `start` is at or past the end of
/// `content`, `start` is returned unchanged.
pub(super) fn scan_generic_token(content: &str, start: usize) -> usize {
    if start >= content.len() {
        return start;
    }
    let tail = &content[start..];
    let run_len = tail.find(char::is_whitespace).unwrap_or(tail.len());
    start + run_len
}

/// Scans one token starting at `start` for formats that split on punctuation,
/// returning the token's end offset together with its kind.
///
/// * If the leading character is split punctuation for `format`, the token is
///   that single character (`TokenKind::Punctuation`).
/// * If `format` is code-like and the leading character can start an
///   operator, the token is the whole operator run (`TokenKind::Operator`).
/// * Otherwise the token (`TokenKind::Default`) extends until whitespace,
///   split punctuation, an operator character (code-like formats only), or
///   the end of `content`.
pub(super) fn scan_punctuation_split_token(
    content: &str,
    format: &str,
    start: usize,
) -> (usize, TokenKind) {
    let tail = &content[start..];
    let lead = tail.chars().next().unwrap_or('\0');
    if is_split_punctuation(format, lead) {
        return (start + lead.len_utf8(), TokenKind::Punctuation);
    }

    let splits_operators = code_like_format(format);
    if splits_operators && is_operator_start(lead) {
        return (scan_operator_token(content, start), TokenKind::Operator);
    }

    // A plain word stops at the first character that would begin another
    // token class.
    let word_len = tail
        .find(|c: char| {
            c.is_whitespace()
                || is_split_punctuation(format, c)
                || (splits_operators && is_operator_start(c))
        })
        .unwrap_or(tail.len());
    (start + word_len, TokenKind::Default)
}

/// Returns the byte offset just past the run of operator characters (as
/// judged by `is_operator_start`) that begins at `start`.
///
/// When `start` is at or past the end of `content`, `start` is returned
/// unchanged.
fn scan_operator_token(content: &str, start: usize) -> usize {
    if start >= content.len() {
        return start;
    }
    let tail = &content[start..];
    let run_len = tail
        .find(|c: char| !is_operator_start(c))
        .unwrap_or(tail.len());
    start + run_len
}

/// Scans a quoted string whose opening quote is the character at `start` and
/// returns the byte offset just past the string.
///
/// A backslash escapes the character that follows it. The string ends right
/// after the first unescaped occurrence of the opening quote, or right after
/// the first unescaped `\n` / `\r` (the line break itself is included in the
/// span). If neither is found the string runs to the end of `content`.
///
/// When `start` is at the end of `content` there is no opening character; the
/// placeholder `'\0'` is used and `start + 1` is returned.
fn scan_quoted_string(content: &str, start: usize) -> usize {
    let opener = content[start..].chars().next().unwrap_or('\0');
    let body_start = start + opener.len_utf8();
    if body_start >= content.len() {
        return body_start;
    }

    let mut skip_next = false;
    for (offset, ch) in content[body_start..].char_indices() {
        if skip_next {
            // This character was escaped by the preceding backslash.
            skip_next = false;
        } else if ch == '\\' {
            skip_next = true;
        } else if ch == opener || ch == '\n' || ch == '\r' {
            return body_start + offset + ch.len_utf8();
        }
    }
    content.len()
}

/// Looks for a format-specific multi-line construct starting at `start`.
///
/// * `"haml"`: an indented comment block, reported as `TokenKind::Comment`.
/// * `"pug"`: a dot block, reported as `TokenKind::Default`.
///
/// Returns the end offset and token kind of the construct, or `None` when the
/// format has no such construct or none starts at `start`.
fn generic_multiline_span_end(
    content: &str,
    format: &str,
    start: usize,
    limit: usize,
) -> Option<(usize, TokenKind)> {
    if format == "haml" {
        let end = haml_multiline_comment_span_end(content, start, limit)?;
        Some((end, TokenKind::Comment))
    } else if format == "pug" {
        let end = pug_dot_block_span_end(content, start, limit)?;
        Some((end, TokenKind::Default))
    } else {
        None
    }
}

/// Returns the end of a Haml comment block that starts at `start`, if any.
///
/// A block is recognised only when `start` is preceded on its line by nothing
/// but spaces/tabs and the text at `start` begins with `-#` or `/`. The block
/// then covers the opening line plus every following line indented deeper
/// than the opener (blank lines are not absorbed).
fn haml_multiline_comment_span_end(content: &str, start: usize, limit: usize) -> Option<usize> {
    let bytes = content.as_bytes();
    let bol = line_start(bytes, start);
    if !line_prefix_is_indent(bytes, bol, start) {
        return None;
    }

    let opens_comment = {
        let window = &bytes[start..limit];
        window.starts_with(b"-#") || window.first() == Some(&b'/')
    };
    opens_comment.then(|| scan_indented_block_end(bytes, bol, start, limit, false))
}

/// Returns the end of a Pug dot block that starts at `start`, if any.
///
/// A block is recognised only when `start` is preceded on its line by nothing
/// but spaces/tabs and the rest of that line satisfies
/// `is_pug_dot_block_opener`. The block covers the opener plus following
/// deeper-indented or blank lines; `None` is returned when the block would
/// not extend beyond the opener line itself.
fn pug_dot_block_span_end(content: &str, start: usize, limit: usize) -> Option<usize> {
    let bytes = content.as_bytes();
    let bol = line_start(bytes, start);
    if !line_prefix_is_indent(bytes, bol, start) {
        return None;
    }

    let opener_end = line_content_end(bytes, start, limit);
    if !is_pug_dot_block_opener(&content[start..opener_end]) {
        return None;
    }

    match scan_indented_block_end(bytes, bol, start, limit, true) {
        block_end if block_end > opener_end => Some(block_end),
        _ => None,
    }
}

/// Finds where an indentation-delimited block ends.
///
/// The block opens at `start` on the line beginning at `line_start`; the
/// opener's indentation is `start - line_start` bytes. Starting with the line
/// after the opener, each line is absorbed while it is indented by more bytes
/// (spaces/tabs) than the opener, or, when `include_blank_lines` is set, while
/// it contains nothing but spaces/tabs. Scanning never goes past `limit`.
///
/// Returns the offset of the end of the last absorbed line's content (its
/// line terminator is not included), or the end of the opener line when no
/// further line qualifies.
fn scan_indented_block_end(
    bytes: &[u8],
    line_start: usize,
    start: usize,
    limit: usize,
    include_blank_lines: bool,
) -> usize {
    let base_indent = start.saturating_sub(line_start);
    let mut block_end = line_content_end(bytes, start, limit);
    let mut cursor = next_line_start(bytes, block_end, limit);

    loop {
        if cursor >= limit {
            return block_end;
        }

        let eol = line_content_end(bytes, cursor, limit);
        let text_start = scan_indent(bytes, cursor, eol);
        let deeper = text_start.saturating_sub(cursor) > base_indent;
        let blank = text_start == eol;
        if !(deeper || (include_blank_lines && blank)) {
            return block_end;
        }

        block_end = eol;
        cursor = next_line_start(bytes, eol, limit);
    }
}

/// Reports whether `line` looks like the opener of a Pug dot block.
///
/// After trailing spaces/tabs are removed the line must end in `.`, and the
/// text before that dot must be non-empty, must not be `script` (compared
/// ASCII case-insensitively), and may contain only ASCII letters, digits,
/// `_`, `-`, `#` and `.`.
fn is_pug_dot_block_opener(line: &str) -> bool {
    let without_padding = line.trim_end_matches(|c: char| c == ' ' || c == '\t');
    let selector = match without_padding.strip_suffix('.') {
        Some(selector) => selector,
        None => return false,
    };
    if selector.is_empty() || selector.eq_ignore_ascii_case("script") {
        return false;
    }
    selector
        .chars()
        .all(|c| c.is_ascii_alphanumeric() || "_-#.".contains(c))
}

/// Reports whether `ch` is emitted as its own punctuation token for `format`.
///
/// Brackets, braces, parentheses, `:`, `;` and `,` always split; `.` splits
/// only for code-like formats.
fn is_split_punctuation(format: &str, ch: char) -> bool {
    match ch {
        '{' | '}' | '(' | ')' | '[' | ']' | ':' | ';' | ',' => true,
        '.' => code_like_format(format),
        _ => false,
    }
}

/// Reports whether `ch` is one of the characters treated as part of an
/// operator: `+ - * / % = ! < > & | ^ ~ ?`.
fn is_operator_start(ch: char) -> bool {
    const OPERATOR_CHARS: &str = "+-*/%=!<>&|^~?";
    OPERATOR_CHARS.contains(ch)
}

/// Returns the byte offset just past the run of whitespace characters (per
/// `char::is_whitespace`) that begins at `start`.
///
/// When `start` is at or past the end of `content`, `start` is returned
/// unchanged.
pub(super) fn scan_whitespace(content: &str, start: usize) -> usize {
    if start >= content.len() {
        return start;
    }
    let tail = &content[start..];
    let run_len = tail
        .find(|c: char| !c.is_whitespace())
        .unwrap_or(tail.len());
    start + run_len
}

/// Decides whether the whitespace span `start..end` is kept as a token for
/// Twig when not in strict mode.
///
/// * An empty span is never kept.
/// * A span without a `\n` is kept when there is some non-whitespace byte
///   both before and after it.
/// * A span containing a `\n` is kept only when the nearest non-whitespace
///   byte before it is `>` and the nearest one after it is `<`.
fn twig_keeps_mild_whitespace(content: &str, start: usize, end: usize) -> bool {
    if start >= end {
        return false;
    }

    let spans_lines = content[start..end].contains('\n');
    let before = previous_non_whitespace(content, start);
    let after = next_non_whitespace(content, end);
    match (before, after) {
        (Some(b'>'), Some(b'<')) => true,
        (Some(_), Some(_)) => !spans_lines,
        _ => false,
    }
}

/// Returns the last byte before offset `end` that is not ASCII whitespace, or
/// `None` when everything before `end` is ASCII whitespace (or empty).
fn previous_non_whitespace(content: &str, end: usize) -> Option<u8> {
    let head = content[..end].as_bytes();
    head.iter()
        .rfind(|byte| !byte.is_ascii_whitespace())
        .copied()
}

/// Returns the first byte at or after offset `start` that is not ASCII
/// whitespace, or `None` when only ASCII whitespace (or nothing) remains.
fn next_non_whitespace(content: &str, start: usize) -> Option<u8> {
    let tail = content[start..].as_bytes();
    tail.iter()
        .copied()
        .find(|byte| !byte.is_ascii_whitespace())
}

/// Returns the end offset of a comment that starts at `start`, considering
/// only the window `start..limit`, or `None` when no comment starts there.
///
/// Recognised openers, tried in this order:
///
/// * `<!--` — scanned by `scan_html_comment`
/// * `/*` — scanned by `scan_block_comment`
/// * `//` — to the end of the line
/// * `--` — to the end of the line, for formats accepted by
///   `generic_double_dash_comment_format`
/// * `#` — to the end of the line, for formats accepted by
///   `generic_hash_comment_format`
/// * `;` — to the end of the line, for formats accepted by
///   `generic_semicolon_comment_format`
///
/// An empty window (`start == limit`) yields `None`.
///
/// # Panics
///
/// Panics if `start > limit` or `limit > content.len()`.
pub(super) fn generic_comment_span_end(
    content: &str,
    format: &str,
    start: usize,
    limit: usize,
) -> Option<usize> {
    let bytes = content.as_bytes();
    let rest = &bytes[start..limit];
    if rest.starts_with(b"<!--") {
        return Some(scan_html_comment(bytes, start, limit));
    }
    if rest.starts_with(b"/*") {
        return Some(scan_block_comment(bytes, start, limit));
    }
    if rest.starts_with(b"//") {
        return Some(scan_to_line_end(bytes, start, limit));
    }
    if rest.starts_with(b"--") && generic_double_dash_comment_format(format) {
        return Some(scan_to_line_end(bytes, start, limit));
    }

    // Read the leading byte from the window rather than `bytes[start]`: with
    // an empty window the latter either indexes past the end of `content` or
    // inspects a byte that lies outside `start..limit`.
    match rest.first() {
        Some(b'#') if generic_hash_comment_format(format) => {
            Some(scan_to_line_end(bytes, start, limit))
        }
        Some(b';') if generic_semicolon_comment_format(format) => {
            Some(scan_to_line_end(bytes, start, limit))
        }
        _ => None,
    }
}

/// Reports whether `format` is one of the formats in which `#` starts a line
/// comment for this tokenizer.
fn generic_hash_comment_format(format: &str) -> bool {
    const HASH_COMMENT_FORMATS: &[&str] = &[
        "apacheconf",
        "applescript",
        "bash",
        "cmake",
        "docker",
        "editorconfig",
        "git",
        "ignore",
        "ini",
        "julia",
        "makefile",
        "nginx",
        "nix",
        "perl",
        "powershell",
        "properties",
        "python",
        "r",
        "ruby",
        "shell-session",
        "tcl",
        "toml",
        "vim",
        "yaml",
    ];
    HASH_COMMENT_FORMATS.contains(&format)
}

/// Reports whether `format` is one of the formats in which `--` starts a line
/// comment for this tokenizer.
fn generic_double_dash_comment_format(format: &str) -> bool {
    const DOUBLE_DASH_COMMENT_FORMATS: &[&str] = &[
        "ada",
        "applescript",
        "elm",
        "haskell",
        "lua",
        "plsql",
        "sql",
    ];
    DOUBLE_DASH_COMMENT_FORMATS.contains(&format)
}

/// Reports whether `format` is one of the formats in which `;` starts a line
/// comment for this tokenizer.
fn generic_semicolon_comment_format(format: &str) -> bool {
    const SEMICOLON_COMMENT_FORMATS: &[&str] = &[
        "asm6502",
        "autoit",
        "autohotkey",
        "clojure",
        "ini",
        "lisp",
        "llvm",
        "nasm",
        "racket",
        "scheme",
    ];
    SEMICOLON_COMMENT_FORMATS.contains(&format)
}

/// Reports whether tokens of `format` are split on punctuation, which is the
/// case for CSS-like and code-like formats.
fn punctuation_split_format(format: &str) -> bool {
    if css_like_format(format) {
        return true;
    }
    code_like_format(format)
}

/// Reports whether `format` is one of the stylesheet formats: `css`, `less`,
/// `sass`, `scss` or `stylus`.
fn css_like_format(format: &str) -> bool {
    const CSS_LIKE_FORMATS: &[&str] = &["css", "less", "sass", "scss", "stylus"];
    CSS_LIKE_FORMATS.contains(&format)
}

/// Reports whether `format` is in the set of formats this tokenizer treats as
/// code-like. For these formats `.` is split punctuation and operator runs
/// become their own tokens.
fn code_like_format(format: &str) -> bool {
    const CODE_LIKE_FORMATS: &[&str] = &[
        "ada",
        "apex",
        "aspnet",
        "c",
        "c-header",
        "clike",
        "clojure",
        "cmake",
        "coffeescript",
        "cpp",
        "cpp-header",
        "csharp",
        "csv",
        "cfml",
        "cfscript",
        "dart",
        "dot",
        "eiffel",
        "go",
        "haml",
        "ini",
        "java",
        "kotlin",
        "haxe",
        "markup",
        "objectivec",
        "ocaml",
        "perl",
        "php",
        "plsql",
        "properties",
        "purescript",
        "python",
        "qsharp",
        "r",
        "rescript",
        "robotframework",
        "rust",
        "scala",
        "solidity",
        "sparql",
        "swift",
        "tcl",
        "tt2",
        "turtle",
        "twig",
        "verilog",
        "wgsl",
        "yaml",
        "zig",
    ];
    CODE_LIKE_FORMATS.contains(&format)
}

/// Returns the offset of the first `\n` in `bytes[start..limit]`, or `limit`
/// when the window contains none.
///
/// A lone `\r` does not end the line. When `start >= limit` nothing is
/// scanned and `start` is returned.
fn scan_to_line_end(bytes: &[u8], start: usize, limit: usize) -> usize {
    (start..limit)
        .find(|&idx| bytes[idx] == b'\n')
        .unwrap_or_else(|| start.max(limit))
}

/// Returns the offset at which the line containing `start` begins: one past
/// the nearest `\n` or `\r` before `start`, or `0` when there is none.
fn line_start(bytes: &[u8], start: usize) -> usize {
    bytes[..start]
        .iter()
        .rposition(|&byte| byte == b'\n' || byte == b'\r')
        .map_or(0, |terminator| terminator + 1)
}

/// Reports whether every byte in `bytes[line_start..start]` is a space or a
/// tab. An empty range counts as indentation.
fn line_prefix_is_indent(bytes: &[u8], line_start: usize, start: usize) -> bool {
    for &byte in &bytes[line_start..start] {
        if byte != b' ' && byte != b'\t' {
            return false;
        }
    }
    true
}

/// Returns the offset of the first `\n` or `\r` in `bytes[start..limit]`, or
/// `limit` when the window contains neither.
///
/// When `start >= limit` nothing is scanned and `start` is returned.
fn line_content_end(bytes: &[u8], start: usize, limit: usize) -> usize {
    (start..limit)
        .find(|&idx| matches!(bytes[idx], b'\n' | b'\r'))
        .unwrap_or_else(|| start.max(limit))
}

/// Given `line_end`, the offset of a line terminator, returns the offset at
/// which the following line begins.
///
/// A `\r\n` pair that lies entirely below `limit` is skipped as a unit; any
/// other byte at `line_end` is skipped on its own. When `line_end >= limit`
/// the result is `limit`.
fn next_line_start(bytes: &[u8], line_end: usize, limit: usize) -> usize {
    if line_end >= limit {
        return limit;
    }
    let terminator_len = match bytes[line_end] {
        b'\r' if line_end + 1 < limit && bytes[line_end + 1] == b'\n' => 2,
        _ => 1,
    };
    line_end + terminator_len
}

/// Returns the offset of the first byte in `bytes[start..limit]` that is
/// neither a space nor a tab, or `limit` when the whole window is
/// indentation.
///
/// When `start >= limit` nothing is scanned and `start` is returned.
fn scan_indent(bytes: &[u8], start: usize, limit: usize) -> usize {
    (start..limit)
        .find(|&idx| !matches!(bytes[idx], b' ' | b'\t'))
        .unwrap_or_else(|| start.max(limit))
}

/// Returns the end offset of an HTML comment whose `<!--` opener begins at
/// `start`.
///
/// The search for the closing `-->` begins right after the four-byte opener
/// and only accepts a terminator lying entirely below `limit`. The result is
/// the offset just past `-->`, or `limit` when the comment is not closed
/// within the window.
fn scan_html_comment(bytes: &[u8], start: usize, limit: usize) -> usize {
    let body_start = start + 4;
    // The last position at which a three-byte terminator still fits.
    let search_end = limit.saturating_sub(2);
    (body_start..search_end)
        .find(|&idx| bytes[idx] == b'-' && bytes[idx + 1] == b'-' && bytes[idx + 2] == b'>')
        .map_or(limit, |idx| idx + 3)
}