jscpd-rs 0.1.3

50x+ faster duplicate-code detector for CI/CD; jscpd-compatible CLI, SARIF, JSON, HTML reports
Documentation
use super::super::scan::{scan_block_comment, scan_line_comment};
use super::super::{
    ByteSpan, DetectionToken, LineIndex, TokenContext, TokenKind, push_strict_whitespace_tokens,
    push_token,
};
use super::lexical::{is_js_constant, is_js_keyword};
use super::push_line_comment_tokens;
use super::scan_regex_literal_end;

/// Tokenizes the `range_start..range_end` byte range of `context.content` as
/// JavaScript-like source, appending every produced token to `tokens`.
///
/// The scanner looks at the character under the cursor and dispatches, in this
/// order:
///
/// 1. whitespace runs are handed to `push_strict_whitespace_tokens`;
/// 2. `//` comments are handed to `push_line_comment_tokens`, unless the
///    configured mode is `Mode::Weak`, in which case they are skipped;
/// 3. `/* ... */` comments become `TokenKind::Comment`;
/// 4. any other `/` is tried as a regex literal (`TokenKind::String`) and
///    otherwise scanned as an operator;
/// 5. backtick templates become `TokenKind::String`; an unterminated template
///    extends to `range_end`;
/// 6. `'`/`"` strings become `TokenKind::String` when closed on the same line,
///    otherwise the fragment after the quote is emitted as `TokenKind::Default`;
/// 7. identifiers are classified as constant, keyword or default;
/// 8. ASCII digits start a `TokenKind::Number`;
/// 9. everything else is an operator or punctuation token.
///
/// The cursor always advances by at least one byte (one full character for
/// whitespace), so the loop terminates even if a scanner reports no progress.
pub(super) fn tokenize_js_like_range(
    tokens: &mut Vec<DetectionToken>,
    context: &TokenContext<'_>,
    range_start: usize,
    range_end: usize,
    line_index: &LineIndex,
) {
    let source = context.content;
    let bytes = source.as_bytes();
    let mut cursor = range_start;

    while cursor < range_end {
        let current = source[cursor..].chars().next().unwrap_or('\0');

        if current.is_whitespace() {
            let run_end = scan_whitespace(source, cursor, range_end);
            let span = ByteSpan {
                start: cursor,
                end: run_end,
            };
            push_strict_whitespace_tokens(tokens, context, span, line_index);
            cursor = run_end.max(cursor + current.len_utf8());
            continue;
        }

        let lead = bytes[cursor];
        // Only a `/` needs one byte of lookahead (to tell `//`, `/*` and a bare `/` apart).
        let follower = if lead == b'/' && cursor + 1 < range_end {
            Some(bytes[cursor + 1])
        } else {
            None
        };

        if follower == Some(b'/') {
            let comment_end = scan_line_comment(bytes, cursor, range_end);
            if context.options.mode != crate::cli::Mode::Weak {
                let span = ByteSpan {
                    start: cursor,
                    end: comment_end,
                };
                push_line_comment_tokens(tokens, context, span, line_index);
            }
            cursor = comment_end.max(cursor + 1);
            continue;
        }

        let (end, kind) = match lead {
            b'/' if follower == Some(b'*') => (
                scan_block_comment(bytes, cursor, range_end),
                TokenKind::Comment,
            ),
            b'/' => match scan_regex_literal_end(source, cursor, range_end) {
                Some(regex_end) => (regex_end, TokenKind::String),
                None => scan_operator_or_punctuation(bytes, cursor, range_end),
            },
            b'`' => (
                scan_template_literal(bytes, cursor, range_end).unwrap_or(range_end),
                TokenKind::String,
            ),
            b'\'' | b'"' => match scan_closed_string(bytes, cursor, lead, range_end) {
                Some(string_end) => (string_end, TokenKind::String),
                None => (
                    scan_unclosed_quote_fragment(source, cursor, range_end),
                    TokenKind::Default,
                ),
            },
            _ if is_identifier_start(current) => {
                let word_end = scan_identifier(source, cursor, range_end);
                let word = &source[cursor..word_end];
                let word_kind = if is_js_constant(word) {
                    TokenKind::Constant
                } else if is_js_keyword(word) {
                    TokenKind::Keyword
                } else {
                    TokenKind::Default
                };
                (word_end, word_kind)
            }
            _ if lead.is_ascii_digit() => (scan_number(bytes, cursor, range_end), TokenKind::Number),
            _ => scan_operator_or_punctuation(bytes, cursor, range_end),
        };

        push_token(
            tokens,
            context,
            kind,
            ByteSpan { start: cursor, end },
            line_index.location(cursor),
            line_index.location(end),
        );
        cursor = end.max(cursor + 1);
    }
}

/// Finds the end of a quoted string literal that is closed on its own line.
///
/// `start` is the index of the opening quote and `quote` is the quote byte to
/// look for. Scanning never reads at or beyond `limit`.
///
/// Returns `Some(end)` where `end` is the index just past the closing quote,
/// or `None` when an unescaped line break or `limit` is reached first.
///
/// A backslash escapes the byte that follows it. An escaped `\r\n` pair is
/// treated as a single line continuation, so strings continued across lines in
/// files with Windows line endings are still recognised as closed.
fn scan_closed_string(bytes: &[u8], start: usize, quote: u8, limit: usize) -> Option<usize> {
    let mut idx = start + 1;
    while idx < limit {
        match bytes[idx] {
            b'\\' => {
                // `\` + CRLF is one escape sequence; skipping only the `\r`
                // would leave the `\n` to be misread as an unescaped line break.
                let escaped_crlf =
                    idx + 2 < limit && bytes[idx + 1] == b'\r' && bytes[idx + 2] == b'\n';
                let skip = if escaped_crlf { 3 } else { 2 };
                idx = (idx + skip).min(limit);
            }
            b'\n' | b'\r' => return None,
            byte if byte == quote => return Some(idx + 1),
            _ => idx += 1,
        }
    }
    None
}

/// Finds the end of a backtick template literal whose opening backtick is at
/// `start`, never reading at or beyond `limit`.
///
/// Returns `Some(end)` with `end` just past the closing backtick, or `None`
/// when the template is not terminated before `limit`.
///
/// `${ ... }` interpolations are followed so that nested templates and quoted
/// strings inside them (for example `` `a ${ `b` } c` ``) do not end the outer
/// template early. If the interpolation structure is unbalanced, the scan
/// falls back to stopping at the first unescaped backtick.
fn scan_template_literal(bytes: &[u8], start: usize, limit: usize) -> Option<usize> {
    scan_interpolated_template(bytes, start, limit)
        .or_else(|| scan_flat_template(bytes, start, limit))
}

/// Interpolation-aware template scan; `None` when the nesting never balances.
///
/// Comments and regex literals inside interpolations are not recognised; their
/// bytes are treated as ordinary expression code.
fn scan_interpolated_template(bytes: &[u8], start: usize, limit: usize) -> Option<usize> {
    /// What the scanner is currently inside of.
    enum Frame {
        /// Raw template text between backticks.
        Text,
        /// Expression code inside `${ ... }` or a nested `{ ... }`.
        Code,
    }

    // An explicit stack keeps deeply nested input from recursing.
    let mut frames = vec![Frame::Text];
    let mut idx = start + 1;
    while idx < limit {
        let byte = bytes[idx];
        if matches!(frames.last(), Some(Frame::Text)) {
            match byte {
                b'\\' => idx = (idx + 2).min(limit),
                b'`' => {
                    frames.pop();
                    idx += 1;
                    if frames.is_empty() {
                        return Some(idx);
                    }
                }
                b'$' if idx + 1 < limit && bytes[idx + 1] == b'{' => {
                    frames.push(Frame::Code);
                    idx += 2;
                }
                _ => idx += 1,
            }
        } else {
            match byte {
                b'`' => {
                    frames.push(Frame::Text);
                    idx += 1;
                }
                b'{' => {
                    frames.push(Frame::Code);
                    idx += 1;
                }
                b'}' => {
                    // The bottom frame is always `Text`, so this never empties the stack.
                    frames.pop();
                    idx += 1;
                }
                b'\'' | b'"' => idx = skip_interpolation_quote(bytes, idx, limit),
                _ => idx += 1,
            }
        }
    }
    None
}

/// Flat template scan: stops at the first unescaped backtick after `start`.
fn scan_flat_template(bytes: &[u8], start: usize, limit: usize) -> Option<usize> {
    let mut idx = start + 1;
    while idx < limit {
        match bytes[idx] {
            b'\\' => idx = (idx + 2).min(limit),
            b'`' => return Some(idx + 1),
            _ => idx += 1,
        }
    }
    None
}

/// Skips a quoted string inside an interpolation and returns the index just
/// past its closing quote. A quote that is not closed on the same line is
/// treated as an ordinary byte (the result is `start + 1`).
fn skip_interpolation_quote(bytes: &[u8], start: usize, limit: usize) -> usize {
    let quote = bytes[start];
    let mut idx = start + 1;
    while idx < limit {
        match bytes[idx] {
            b'\\' => idx = (idx + 2).min(limit),
            b'\n' | b'\r' => break,
            byte if byte == quote => return idx + 1,
            _ => idx += 1,
        }
    }
    start + 1
}

/// Measures the fragment that follows a quote character which never closes.
///
/// Starting right after the quote at `start`, characters are consumed until a
/// whitespace character, a byte accepted by `is_js_text_delimiter`, or `limit`
/// is reached. The returned index is where the fragment ends.
fn scan_unclosed_quote_fragment(content: &str, start: usize, limit: usize) -> usize {
    let raw = content.as_bytes();
    let mut cursor = start + 1;
    loop {
        if cursor >= limit {
            return cursor;
        }
        let current = content[cursor..].chars().next().unwrap_or('\0');
        let stops_here = current.is_whitespace() || is_js_text_delimiter(raw[cursor]);
        if stops_here {
            return cursor;
        }
        cursor += current.len_utf8();
    }
}

/// Returns the index where the whitespace run beginning at `start` ends.
///
/// Characters are examined while their starting index is below `limit`; the
/// result is the index of the first character that is not whitespace (or that
/// starts at or past `limit`), or the end of `content` if none is found.
fn scan_whitespace(content: &str, start: usize, limit: usize) -> usize {
    if start >= limit {
        return start;
    }
    content[start..]
        .char_indices()
        .map(|(offset, ch)| (start + offset, ch))
        .find(|&(position, ch)| position >= limit || !ch.is_whitespace())
        .map_or(content.len(), |(position, _)| position)
}

/// Returns the index where the identifier beginning at `start` ends.
///
/// Characters are consumed for as long as they start below `limit` and satisfy
/// `is_identifier_continue`.
fn scan_identifier(content: &str, start: usize, limit: usize) -> usize {
    let mut cursor = start;
    loop {
        if cursor >= limit {
            break cursor;
        }
        match content[cursor..].chars().next() {
            Some(ch) if is_identifier_continue(ch) => cursor += ch.len_utf8(),
            _ => break cursor,
        }
    }
}

/// Returns the index where the numeric literal beginning at `start` ends,
/// never reading at or beyond `limit`.
///
/// ASCII letters and digits, `.` and `_` are always part of the literal, which
/// covers decimals, radix prefixes (`0x`, `0o`, `0b`), numeric separators and
/// suffixes such as `n`.
///
/// `+` and `-` are accepted only as the sign of a decimal exponent: directly
/// after `e`/`E`, directly before a digit, and only when the literal has no
/// radix prefix. Anywhere else they are operators, so `1+2` and `0xE+1` end
/// the number before the sign.
fn scan_number(bytes: &[u8], start: usize, limit: usize) -> usize {
    // In `0x..`, `0o..` and `0b..` literals an `e` is a digit (or invalid), never an exponent.
    let has_radix_prefix = start + 1 < limit
        && bytes[start] == b'0'
        && matches!(bytes[start + 1], b'x' | b'X' | b'o' | b'O' | b'b' | b'B');

    let mut idx = start;
    while idx < limit {
        let byte = bytes[idx];
        let accepted = match byte {
            b'.' | b'_' => true,
            b'+' | b'-' => {
                !has_radix_prefix
                    && idx > start
                    && matches!(bytes[idx - 1], b'e' | b'E')
                    && idx + 1 < limit
                    && bytes[idx + 1].is_ascii_digit()
            }
            _ => byte.is_ascii_alphanumeric(),
        };
        if !accepted {
            break;
        }
        idx += 1;
    }
    idx
}

/// Scans one operator or punctuation token beginning at `start`.
///
/// Multi-byte operators are matched first, longest first, and only within
/// `bytes[start..limit]`. If none matches, the single byte at `start` is
/// returned as `TokenKind::Punctuation` when it is a bracket, `;`, `,`, `:` or
/// `.`, and as `TokenKind::Operator` otherwise.
///
/// Returns the end index of the token together with its kind.
fn scan_operator_or_punctuation(bytes: &[u8], start: usize, limit: usize) -> (usize, TokenKind) {
    // Ordered by descending length so the first match is always the longest one.
    // `&&=`, `||=` and `??=` are the ES2021 logical assignment operators.
    const OPERATORS: &[&[u8]] = &[
        b">>>=", b"===", b"!==", b">>>", b"<<=", b">>=", b"**=", b"&&=", b"||=", b"??=", b"...",
        b"=>", b"==", b"!=", b"<=", b">=", b"++", b"--", b"&&", b"||", b"??", b"?.", b"+=", b"-=",
        b"*=", b"/=", b"%=", b"&=", b"|=", b"^=", b"<<", b">>", b"**",
    ];

    let window = &bytes[start..limit];
    if let Some(operator) = OPERATORS.iter().find(|operator| window.starts_with(operator)) {
        return (start + operator.len(), TokenKind::Operator);
    }

    let kind = match bytes[start] {
        b'{' | b'}' | b'[' | b']' | b'(' | b')' | b';' | b',' | b':' | b'.' => {
            TokenKind::Punctuation
        }
        _ => TokenKind::Operator,
    };
    (start + 1, kind)
}

/// Reports whether `byte` is an ASCII bracket, separator or operator character
/// that ends a run of loose text (see `scan_unclosed_quote_fragment`).
fn is_js_text_delimiter(byte: u8) -> bool {
    const DELIMITERS: &[u8] = b"{}[]();,:.<>=+-*/%&|^!?~";
    DELIMITERS.contains(&byte)
}

/// Reports whether `ch` may begin an identifier: `_`, `$`, an ASCII letter, or
/// any character outside the ASCII range.
fn is_identifier_start(ch: char) -> bool {
    match ch {
        '_' | '$' => true,
        ascii if ascii.is_ascii() => ascii.is_ascii_alphabetic(),
        _ => true,
    }
}

/// Reports whether `ch` may appear after the first character of an
/// identifier: an ASCII digit or anything `is_identifier_start` accepts.
fn is_identifier_continue(ch: char) -> bool {
    if ch.is_ascii_digit() {
        return true;
    }
    is_identifier_start(ch)
}

#[cfg(test)]
mod tests {
    use crate::cli::Options;
    use crate::tokenizer::hash_token;

    use super::*;

    /// A template literal that spans several lines must be emitted as one
    /// string token covering lines 1 through 3, hashed over its full text.
    #[test]
    fn multiline_template_literals_are_single_string_tokens() {
        const TEMPLATE_TEXT: &str = "`\n  [root]\n`";

        let content = "expect(store).toMatchInlineSnapshot(`\n  [root]\n`);\n";
        let options = Options::default();
        let context = TokenContext {
            content,
            options: &options,
            ignore_regions: &[],
        };
        let line_index = LineIndex::new(content);

        let mut tokens = Vec::new();
        tokenize_js_like_range(&mut tokens, &context, 0, content.len(), &line_index);

        // Source text covered by a token, read back through its byte range.
        let text_of = |token: &DetectionToken| &content[token.range[0]..token.range[1]];

        let template = tokens
            .iter()
            .find(|token| text_of(token).starts_with('`'))
            .expect("template token");

        assert_eq!(text_of(template), TEMPLATE_TEXT);
        assert_eq!(template.start.line, 1);
        assert_eq!(template.end.line, 3);

        let expected_hash = hash_token(TokenKind::String, TEMPLATE_TEXT, false);
        assert_eq!(template.hash, expected_hash);
    }
}