1use std::str::FromStr;
2
3use cpd_core::hash::hash_token;
4use cpd_core::models::{DetectionToken, Token, TokenKind};
5
6#[derive(Debug, Clone)]
15pub struct TokenMap {
16 pub format: String,
17 pub tokens: Vec<DetectionToken>,
18}
19
/// Controls which token kinds are filtered out of the token stream.
///
/// `Ignore` tokens are removed in every mode; the variants below describe
/// what is removed in addition to that. The default is [`Mode::Mild`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum Mode {
    /// Drops whitespace tokens.
    #[default]
    Mild,
    /// Drops whitespace, line-comment and block-comment tokens.
    Weak,
    /// Drops nothing based on token kind.
    Strict,
}

impl FromStr for Mode {
    type Err = ();

    /// Parses a mode name.
    ///
    /// Recognises exactly `"weak"` and `"strict"` (case-sensitive). Every
    /// other input, including `"mild"` and the empty string, yields
    /// [`Mode::Mild`], so this never returns `Err`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mode = if s == "weak" {
            Self::Weak
        } else if s == "strict" {
            Self::Strict
        } else {
            // Unknown names deliberately fall back to the default mode.
            Self::Mild
        };
        Ok(mode)
    }
}
39
40#[derive(Debug, Clone)]
50pub struct TokenizeOptions {
51 pub mode: Mode,
52 pub ignore_case: bool,
54 pub ignore_ranges: Vec<[usize; 2]>,
58 pub code_ignore_regexes: Vec<regex::Regex>,
62}
63
64impl TokenizeOptions {
65 pub fn new(mode: Mode) -> Self {
66 Self {
67 mode,
68 ignore_case: false,
69 ignore_ranges: Vec::new(),
70 code_ignore_regexes: Vec::new(),
71 }
72 }
73
74 pub fn with_code_ignore_patterns(mode: Mode, patterns: &[String]) -> Self {
77 let code_ignore_regexes: Vec<regex::Regex> = patterns
78 .iter()
79 .filter_map(|p| regex::Regex::new(p).ok())
80 .collect();
81 Self {
82 mode,
83 ignore_case: false,
84 ignore_ranges: Vec::new(),
85 code_ignore_regexes,
86 }
87 }
88}
89
90pub fn code_ignore_ranges(source: &str, regexes: &[regex::Regex]) -> Vec<[usize; 2]> {
95 let mut ranges = Vec::new();
96 for re in regexes {
97 for m in re.find_iter(source) {
98 ranges.push([m.start(), m.end()]);
99 }
100 }
101 ranges
102}
103
104#[allow(clippy::too_many_arguments)]
114#[inline]
115pub fn push_token(
116 tokens: &mut Vec<DetectionToken>,
117 kind: TokenKind,
118 value: &str,
119 byte_start: usize,
120 byte_end: usize,
121 start: cpd_core::models::Location,
122 end: cpd_core::models::Location,
123 options: &TokenizeOptions,
124) {
125 if kind == TokenKind::Ignore {
127 return;
128 }
129 if options
133 .ignore_ranges
134 .iter()
135 .any(|[rs, re]| byte_start < *re && byte_end > *rs)
136 {
137 return;
138 }
139 match options.mode {
141 Mode::Mild => {
142 if kind == TokenKind::Whitespace {
143 return;
144 }
145 }
146 Mode::Weak => {
147 if matches!(
148 kind,
149 TokenKind::Whitespace | TokenKind::Comment | TokenKind::BlockComment
150 ) {
151 return;
152 }
153 }
154 Mode::Strict => {} }
156 tokens.push(DetectionToken {
157 hash: hash_token(kind.discriminant(), value, options.ignore_case),
158 start,
159 end,
160 range: [byte_start, byte_end],
161 });
162}
163
164pub fn tokenize(format: &str, source: &str, mode: Mode) -> Vec<Token> {
170 let raw = dispatch_tokenizer(format, source, mode);
171 raw.into_iter().filter(|t| keep_token(t, mode)).collect()
174}
175
176fn keep_token(token: &Token, mode: Mode) -> bool {
177 if token.kind == TokenKind::Ignore {
178 return false;
179 }
180 match mode {
181 Mode::Mild => !matches!(token.kind, TokenKind::Whitespace),
182 Mode::Weak => !matches!(
183 token.kind,
184 TokenKind::Whitespace | TokenKind::Comment | TokenKind::BlockComment
185 ),
186 Mode::Strict => true,
187 }
188}
189
190pub fn tokenize_to_detection(
200 format: &str,
201 source: &str,
202 options: &TokenizeOptions,
203) -> Vec<DetectionToken> {
204 let raw = dispatch_tokenizer(format, source, options.mode);
212 let mut detection = Vec::with_capacity(raw.len());
213 for t in raw {
214 let byte_start = t.start.offset as usize;
215 let byte_end = t.end.offset as usize;
216 push_token(
217 &mut detection,
218 t.kind,
219 &t.value,
220 byte_start,
221 byte_end,
222 t.start,
223 t.end,
224 options,
225 );
226 }
227 detection
228}
229
230fn dispatch_tokenizer(format: &str, source: &str, mode: Mode) -> Vec<Token> {
231 match format {
232 "javascript" | "typescript" | "jsx" | "tsx" => {
233 crate::javascript::tokenize_js(source, format)
234 }
235 "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc(source, format, mode),
236 "markdown" | "md" => crate::markdown::tokenize_markdown(source, mode),
237 _ => crate::generic::tokenize_generic(source, format),
238 }
239}
240
241pub fn tokenize_to_detection_maps(
250 format: &str,
251 source: &str,
252 options: &TokenizeOptions,
253) -> Vec<TokenMap> {
254 match format {
255 "markdown" | "md" => crate::markdown::tokenize_markdown_maps(source, options),
256 "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc_maps(source, format, options),
257 _ => {
258 let tokens = tokenize_to_detection(format, source, options);
259 vec![TokenMap {
260 format: format.to_string(),
261 tokens,
262 }]
263 }
264 }
265}
266
#[cfg(test)]
mod tests {
    use super::*;

    /// Placeholder location; these tests never inspect token locations.
    fn origin() -> cpd_core::models::Location {
        cpd_core::models::Location {
            line: 1,
            column: 0,
            offset: 0,
        }
    }

    /// Calls `push_token` for the span `range` using placeholder locations.
    fn push(
        tokens: &mut Vec<DetectionToken>,
        kind: TokenKind,
        value: &str,
        range: [usize; 2],
        opts: &TokenizeOptions,
    ) {
        push_token(
            tokens,
            kind,
            value,
            range[0],
            range[1],
            origin(),
            origin(),
            opts,
        );
    }

    #[test]
    fn mode_from_str_defaults_to_mild() {
        assert_eq!("unknown".parse::<Mode>().unwrap(), Mode::Mild);
        assert_eq!("mild".parse::<Mode>().unwrap(), Mode::Mild);
    }

    #[test]
    fn mode_from_str_weak() {
        assert_eq!("weak".parse::<Mode>().unwrap(), Mode::Weak);
    }

    #[test]
    fn mode_from_str_strict() {
        assert_eq!("strict".parse::<Mode>().unwrap(), Mode::Strict);
    }

    #[test]
    fn tokenize_to_detection_returns_detection_tokens() {
        let opts = TokenizeOptions::new(Mode::Mild);
        let tokens = tokenize_to_detection("javascript", "function hello() { return 42; }", &opts);
        assert!(
            !tokens.is_empty(),
            "must produce DetectionTokens for valid JS"
        );
    }

    #[test]
    fn tokenize_to_detection_mild_excludes_whitespace() {
        let mild = tokenize_to_detection("javascript", "a b c", &TokenizeOptions::new(Mode::Mild));
        let strict =
            tokenize_to_detection("javascript", "a b c", &TokenizeOptions::new(Mode::Strict));
        // Both runs see the same raw tokens and Mild only removes more of
        // them, so this holds whether or not the tokenizer emits whitespace.
        assert!(
            mild.len() <= strict.len(),
            "Mild mode must never keep more tokens than Strict mode"
        );
    }

    #[test]
    fn push_token_drops_ignore_kind() {
        let mut tokens = Vec::new();
        let opts = TokenizeOptions::new(Mode::Mild);
        push(&mut tokens, TokenKind::Ignore, "secret", [0, 6], &opts);
        assert!(tokens.is_empty(), "Ignore-kind tokens must be dropped");
    }

    #[test]
    fn push_token_drops_whitespace_in_mild_mode() {
        let mut tokens = Vec::new();
        let opts = TokenizeOptions::new(Mode::Mild);
        push(&mut tokens, TokenKind::Whitespace, " ", [0, 1], &opts);
        assert!(tokens.is_empty(), "Whitespace must be dropped in Mild mode");
    }

    #[test]
    fn push_token_keeps_whitespace_in_strict_mode() {
        let mut tokens = Vec::new();
        let opts = TokenizeOptions::new(Mode::Strict);
        push(&mut tokens, TokenKind::Whitespace, " ", [0, 1], &opts);
        assert_eq!(tokens.len(), 1, "Whitespace must be kept in Strict mode");
    }

    #[test]
    fn push_token_drops_comment_in_weak_mode() {
        let mut tokens = Vec::new();
        let opts = TokenizeOptions::new(Mode::Weak);
        push(&mut tokens, TokenKind::Comment, "// note", [0, 7], &opts);
        assert!(tokens.is_empty(), "Comment must be dropped in Weak mode");
    }

    #[test]
    fn push_token_ignore_case_folds_hash() {
        let mut t1 = Vec::new();
        let mut t2 = Vec::new();
        let mut opts = TokenizeOptions::new(Mode::Mild);
        opts.ignore_case = true;
        push(&mut t1, TokenKind::Identifier, "Hello", [0, 5], &opts);
        push(&mut t2, TokenKind::Identifier, "hello", [0, 5], &opts);
        assert_eq!(t1[0].hash, t2[0].hash, "ignore_case must fold case in hash");
    }

    #[test]
    fn push_token_code_ignore_range_skips_overlapping_token() {
        let mut tokens = Vec::new();
        let mut opts = TokenizeOptions::new(Mode::Mild);
        opts.ignore_ranges = vec![[3, 18]];
        push(&mut tokens, TokenKind::Identifier, "foo", [0, 3], &opts);
        push(
            &mut tokens,
            TokenKind::Comment,
            "// cpd-disable",
            [3, 18],
            &opts,
        );
        assert_eq!(tokens.len(), 1, "only the non-matching token should remain");
        assert_eq!(tokens[0].range, [0, 3]);
    }

    #[test]
    fn push_token_code_ignore_range_no_overlap_keeps_all() {
        let mut tokens = Vec::new();
        let mut opts = TokenizeOptions::new(Mode::Mild);
        opts.ignore_ranges = vec![[100, 120]];
        push(&mut tokens, TokenKind::Identifier, "foo", [0, 3], &opts);
        push(&mut tokens, TokenKind::Identifier, "bar", [3, 6], &opts);
        assert_eq!(
            tokens.len(),
            2,
            "both tokens should remain when range doesn't overlap"
        );
    }

    #[test]
    fn code_ignore_ranges_computes_from_source_text() {
        let source = "import foo from 'bar';\nconst x = 1;";
        let re = regex::Regex::new(r"import\s+\w+\s+from").unwrap();
        let ranges = code_ignore_ranges(source, &[re]);
        assert_eq!(ranges.len(), 1, "should find one regex match");
        assert_eq!(ranges[0], [0, 15]);
    }

    #[test]
    fn code_ignore_ranges_multiple_patterns() {
        let source = "// MIT License\nfunction foo() {}\n// Copyright";
        let re1 = regex::Regex::new(r"//\s*MIT\s+License").unwrap();
        let re2 = regex::Regex::new(r"//\s*Copyright").unwrap();
        let ranges = code_ignore_ranges(source, &[re1, re2]);
        assert_eq!(ranges.len(), 2, "should find two regex matches");
    }

    #[test]
    fn code_ignore_ranges_empty_regexes() {
        let source = "function foo() {}";
        let ranges = code_ignore_ranges(source, &[]);
        assert!(ranges.is_empty(), "no regexes means no ranges");
    }

    #[test]
    fn with_code_ignore_patterns_builds_regexes() {
        let opts = TokenizeOptions::with_code_ignore_patterns(
            Mode::Mild,
            &["function".to_string(), r"//\s*cpd-disable".to_string()],
        );
        assert_eq!(opts.code_ignore_regexes.len(), 2);
        assert!(opts.code_ignore_regexes[0].is_match("function"));
        assert!(opts.code_ignore_regexes[1].is_match("// cpd-disable"));
        assert!(!opts.code_ignore_regexes[1].is_match("function"));
    }

    #[test]
    fn tokenize_to_detection_with_code_ignore_ranges_skips_imports() {
        let source = "import * from 'lodash';\nconst x = 1;";
        let regexes = [regex::Regex::new(r"import\s+\*\s+from").unwrap()];
        let ranges = code_ignore_ranges(source, &regexes);
        assert!(!ranges.is_empty(), "should find regex match in source");

        let mut opts = TokenizeOptions::new(Mode::Mild);
        opts.ignore_ranges = ranges.clone();
        let tokens = tokenize_to_detection("javascript", source, &opts);

        // The second line starts at byte 24, right after the newline.
        let has_const = tokens.iter().any(|t| t.range[0] >= 24);
        assert!(
            has_const,
            "tokens after the import line should still be present"
        );

        let none_overlap = tokens.iter().all(|t| {
            ranges
                .iter()
                .all(|[lo, hi]| t.range[1] <= *lo || t.range[0] >= *hi)
        });
        assert!(
            none_overlap,
            "no surviving token may overlap an ignored range"
        );
    }

    #[test]
    fn code_ignore_ranges_multi_token_match() {
        let source = "import * from 'lodash';\nconst result = 42;";
        let re = regex::Regex::new(r"import\s+.*?\s+from").unwrap();
        let ranges = code_ignore_ranges(source, &[re]);
        assert_eq!(
            ranges.len(),
            1,
            "should find one regex match spanning import statement"
        );
        assert_eq!(ranges[0][0], 0, "match should start at beginning");
        assert!(ranges[0][1] > 0, "match should have non-zero end");
    }
}