use std::str::FromStr;
use cpd_core::hash::hash_token;
use cpd_core::models::{DetectionToken, Token, TokenKind};
#[derive(Debug, Clone)]
pub struct TokenMap {
pub format: String,
pub tokens: Vec<DetectionToken>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Mode {
#[default]
Mild,
Weak,
Strict,
}
impl FromStr for Mode {
type Err = ();
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"weak" => Ok(Self::Weak),
"strict" => Ok(Self::Strict),
_ => Ok(Self::Mild),
}
}
}
#[derive(Debug, Clone)]
pub struct TokenizeOptions {
pub mode: Mode,
pub ignore_case: bool,
pub ignore_ranges: Vec<[usize; 2]>,
pub code_ignore_regexes: Vec<regex::Regex>,
}
impl TokenizeOptions {
pub fn new(mode: Mode) -> Self {
Self {
mode,
ignore_case: false,
ignore_ranges: Vec::new(),
code_ignore_regexes: Vec::new(),
}
}
pub fn with_code_ignore_patterns(mode: Mode, patterns: &[String]) -> Self {
let code_ignore_regexes: Vec<regex::Regex> = patterns
.iter()
.filter_map(|p| regex::Regex::new(p).ok())
.collect();
Self {
mode,
ignore_case: false,
ignore_ranges: Vec::new(),
code_ignore_regexes,
}
}
}
pub fn code_ignore_ranges(source: &str, regexes: &[regex::Regex]) -> Vec<[usize; 2]> {
let mut ranges = Vec::new();
for re in regexes {
for m in re.find_iter(source) {
ranges.push([m.start(), m.end()]);
}
}
ranges
}
#[allow(clippy::too_many_arguments)]
#[inline]
pub fn push_token(
tokens: &mut Vec<DetectionToken>,
kind: TokenKind,
value: &str,
byte_start: usize,
byte_end: usize,
start: cpd_core::models::Location,
end: cpd_core::models::Location,
options: &TokenizeOptions,
) {
if kind == TokenKind::Ignore {
return;
}
if options
.ignore_ranges
.iter()
.any(|[rs, re]| byte_start < *re && byte_end > *rs)
{
return;
}
match options.mode {
Mode::Mild => {
if kind == TokenKind::Whitespace {
return;
}
}
Mode::Weak => {
if matches!(
kind,
TokenKind::Whitespace | TokenKind::Comment | TokenKind::BlockComment
) {
return;
}
}
Mode::Strict => {} }
tokens.push(DetectionToken {
hash: hash_token(kind.discriminant(), value, options.ignore_case),
start,
end,
range: [byte_start, byte_end],
});
}
pub fn tokenize(format: &str, source: &str, mode: Mode) -> Vec<Token> {
let raw = dispatch_tokenizer(format, source, mode);
raw.into_iter().filter(|t| keep_token(t, mode)).collect()
}
fn keep_token(token: &Token, mode: Mode) -> bool {
if token.kind == TokenKind::Ignore {
return false;
}
match mode {
Mode::Mild => !matches!(token.kind, TokenKind::Whitespace),
Mode::Weak => !matches!(
token.kind,
TokenKind::Whitespace | TokenKind::Comment | TokenKind::BlockComment
),
Mode::Strict => true,
}
}
pub fn tokenize_to_detection(
format: &str,
source: &str,
options: &TokenizeOptions,
) -> Vec<DetectionToken> {
let raw = dispatch_tokenizer(format, source, options.mode);
let mut detection = Vec::with_capacity(raw.len());
for t in raw {
let byte_start = t.start.offset as usize;
let byte_end = t.end.offset as usize;
push_token(
&mut detection,
t.kind,
&t.value,
byte_start,
byte_end,
t.start,
t.end,
options,
);
}
detection
}
fn dispatch_tokenizer(format: &str, source: &str, mode: Mode) -> Vec<Token> {
match format {
"javascript" | "typescript" | "jsx" | "tsx" => {
crate::javascript::tokenize_js(source, format)
}
"vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc(source, format, mode),
"markdown" | "md" => crate::markdown::tokenize_markdown(source, mode),
_ => crate::generic::tokenize_generic(source, format),
}
}
pub fn tokenize_to_detection_maps(
format: &str,
source: &str,
options: &TokenizeOptions,
) -> Vec<TokenMap> {
match format {
"markdown" | "md" => crate::markdown::tokenize_markdown_maps(source, options),
"vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc_maps(source, format, options),
_ => {
let tokens = tokenize_to_detection(format, source, options);
vec![TokenMap {
format: format.to_string(),
tokens,
}]
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn mode_from_str_defaults_to_mild() {
assert_eq!("unknown".parse::<Mode>().unwrap(), Mode::Mild);
assert_eq!("mild".parse::<Mode>().unwrap(), Mode::Mild);
}
#[test]
fn mode_from_str_weak() {
assert_eq!("weak".parse::<Mode>().unwrap(), Mode::Weak);
}
#[test]
fn mode_from_str_strict() {
assert_eq!("strict".parse::<Mode>().unwrap(), Mode::Strict);
}
#[test]
fn tokenize_to_detection_returns_detection_tokens() {
let opts = TokenizeOptions::new(Mode::Mild);
let tokens = tokenize_to_detection("javascript", "function hello() { return 42; }", &opts);
assert!(
!tokens.is_empty(),
"must produce DetectionTokens for valid JS"
);
}
#[test]
fn tokenize_to_detection_mild_excludes_whitespace() {
let opts = TokenizeOptions::new(Mode::Mild);
let mild = tokenize_to_detection("javascript", "a b c", &opts);
let strict =
tokenize_to_detection("javascript", "a b c", &TokenizeOptions::new(Mode::Strict));
let _ = (mild, strict);
}
#[test]
fn push_token_drops_ignore_kind() {
let mut tokens = Vec::new();
let loc = cpd_core::models::Location {
line: 1,
column: 0,
offset: 0,
};
let opts = TokenizeOptions::new(Mode::Mild);
push_token(
&mut tokens,
TokenKind::Ignore,
"secret",
0,
6,
loc.clone(),
loc,
&opts,
);
assert!(tokens.is_empty(), "Ignore-kind tokens must be dropped");
}
#[test]
fn push_token_drops_whitespace_in_mild_mode() {
let mut tokens = Vec::new();
let loc = cpd_core::models::Location {
line: 1,
column: 0,
offset: 0,
};
let opts = TokenizeOptions::new(Mode::Mild);
push_token(
&mut tokens,
TokenKind::Whitespace,
" ",
0,
1,
loc.clone(),
loc,
&opts,
);
assert!(tokens.is_empty(), "Whitespace must be dropped in Mild mode");
}
#[test]
fn push_token_keeps_whitespace_in_strict_mode() {
let mut tokens = Vec::new();
let loc = cpd_core::models::Location {
line: 1,
column: 0,
offset: 0,
};
let opts = TokenizeOptions::new(Mode::Strict);
push_token(
&mut tokens,
TokenKind::Whitespace,
" ",
0,
1,
loc.clone(),
loc,
&opts,
);
assert_eq!(tokens.len(), 1, "Whitespace must be kept in Strict mode");
}
#[test]
fn push_token_drops_comment_in_weak_mode() {
let mut tokens = Vec::new();
let loc = cpd_core::models::Location {
line: 1,
column: 0,
offset: 0,
};
let opts = TokenizeOptions::new(Mode::Weak);
push_token(
&mut tokens,
TokenKind::Comment,
"// note",
0,
7,
loc.clone(),
loc,
&opts,
);
assert!(tokens.is_empty(), "Comment must be dropped in Weak mode");
}
#[test]
fn push_token_ignore_case_folds_hash() {
let mut t1 = Vec::new();
let mut t2 = Vec::new();
let loc = cpd_core::models::Location {
line: 1,
column: 0,
offset: 0,
};
let mut opts = TokenizeOptions::new(Mode::Mild);
opts.ignore_case = true;
push_token(
&mut t1,
TokenKind::Identifier,
"Hello",
0,
5,
loc.clone(),
loc.clone(),
&opts,
);
push_token(
&mut t2,
TokenKind::Identifier,
"hello",
0,
5,
loc.clone(),
loc,
&opts,
);
assert_eq!(t1[0].hash, t2[0].hash, "ignore_case must fold case in hash");
}
#[test]
fn push_token_code_ignore_range_skips_overlapping_token() {
let mut tokens = Vec::new();
let loc = cpd_core::models::Location {
line: 1,
column: 0,
offset: 0,
};
let mut opts = TokenizeOptions::new(Mode::Mild);
opts.ignore_ranges = vec![[3, 18]];
push_token(
&mut tokens,
TokenKind::Identifier,
"foo",
0,
3,
loc.clone(),
loc.clone(),
&opts,
);
push_token(
&mut tokens,
TokenKind::Comment,
"// cpd-disable",
3,
18,
loc.clone(),
loc,
&opts,
);
assert_eq!(tokens.len(), 1, "only the non-matching token should remain");
assert_eq!(tokens[0].range, [0, 3]);
}
#[test]
fn push_token_code_ignore_range_no_overlap_keeps_all() {
let mut tokens = Vec::new();
let loc = cpd_core::models::Location {
line: 1,
column: 0,
offset: 0,
};
let mut opts = TokenizeOptions::new(Mode::Mild);
opts.ignore_ranges = vec![[100, 120]];
push_token(
&mut tokens,
TokenKind::Identifier,
"foo",
0,
3,
loc.clone(),
loc.clone(),
&opts,
);
push_token(
&mut tokens,
TokenKind::Identifier,
"bar",
3,
6,
loc.clone(),
loc,
&opts,
);
assert_eq!(
tokens.len(),
2,
"both tokens should remain when range doesn't overlap"
);
}
#[test]
fn code_ignore_ranges_computes_from_source_text() {
let source = "import foo from 'bar';\nconst x = 1;";
let re = regex::Regex::new(r"import\s+\w+\s+from").unwrap();
let ranges = code_ignore_ranges(source, &[re]);
assert_eq!(ranges.len(), 1, "should find one regex match");
assert_eq!(ranges[0], [0, 15]);
}
#[test]
fn code_ignore_ranges_multiple_patterns() {
let source = "// MIT License\nfunction foo() {}\n// Copyright";
let re1 = regex::Regex::new(r"//\s*MIT\s+License").unwrap();
let re2 = regex::Regex::new(r"//\s*Copyright").unwrap();
let ranges = code_ignore_ranges(source, &[re1, re2]);
assert_eq!(ranges.len(), 2, "should find two regex matches");
}
#[test]
fn code_ignore_ranges_empty_regexes() {
let source = "function foo() {}";
let ranges = code_ignore_ranges(source, &[]);
assert!(ranges.is_empty(), "no regexes means no ranges");
}
#[test]
fn with_code_ignore_patterns_builds_regexes() {
let opts = TokenizeOptions::with_code_ignore_patterns(
Mode::Mild,
&vec!["function".to_string(), r"//\s*cpd-disable".to_string()],
);
assert_eq!(opts.code_ignore_regexes.len(), 2);
assert!(opts.code_ignore_regexes[0].is_match("function"));
assert!(opts.code_ignore_regexes[1].is_match("// cpd-disable"));
assert!(!opts.code_ignore_regexes[1].is_match("function"));
}
#[test]
fn tokenize_to_detection_with_code_ignore_ranges_skips_imports() {
let source = "import * from 'lodash';\nconst x = 1;";
let regexes = vec![regex::Regex::new(r"import\s+\*\s+from").unwrap()];
let ranges = code_ignore_ranges(source, ®exes);
assert!(!ranges.is_empty(), "should find regex match in source");
let mut opts = TokenizeOptions::new(Mode::Mild);
opts.ignore_ranges = ranges;
let tokens = tokenize_to_detection("javascript", source, &opts);
let has_const = tokens.iter().any(|t| {
t.range[0] >= 24
});
assert!(
has_const,
"tokens after the import line should still be present"
);
}
#[test]
fn code_ignore_ranges_multi_token_match() {
let source = "import * from 'lodash';\nconst result = 42;";
let re = regex::Regex::new(r"import\s+.*?\s+from").unwrap();
let ranges = code_ignore_ranges(source, &[re]);
assert_eq!(
ranges.len(),
1,
"should find one regex match spanning import statement"
);
assert!(ranges[0][0] == 0, "match should start at beginning");
assert!(ranges[0][1] > 0, "match should have non-zero end");
}
}