cpd_tokenizer/
tokenizer.rs

1use std::str::FromStr;
2
3use cpd_core::hash::hash_token;
4use cpd_core::models::{DetectionToken, Token, TokenKind};
5
6/// A sub-format detection map produced by multi-format tokenizers.
7///
8/// For single-format files, `tokenize_to_detection_maps()` returns exactly one
9/// TokenMap with the same format as the file.
10///
11/// For multi-format files (markdown, SFC), one TokenMap is returned per
12/// detected sub-language, each carrying tokens that should enter that
13/// format's detection pool.
14#[derive(Debug, Clone)]
15pub struct TokenMap {
16    pub format: String,
17    pub tokens: Vec<DetectionToken>,
18}
19
/// Token filtering level applied while tokenizing.
///
/// * `Mild` (the default) drops whitespace tokens.
/// * `Weak` drops whitespace, line comments and block comments.
/// * `Strict` keeps every token kind.
///
/// Tokens marked as `Ignore` are removed in all three modes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Mode {
    #[default]
    Mild,
    Weak,
    Strict,
}

impl FromStr for Mode {
    type Err = ();

    /// Parse a mode name.
    ///
    /// Matching is exact and case-sensitive: only `"weak"` and `"strict"` are
    /// recognised. Every other input — including `"mild"`, the empty string
    /// and differently-cased names — resolves to `Mode::Mild`, so this
    /// conversion never returns `Err`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let parsed = if s == "strict" {
            Self::Strict
        } else if s == "weak" {
            Self::Weak
        } else {
            Self::Mild
        };
        Ok(parsed)
    }
}
39
40/// Options for the detection-path tokenizer.
41///
42/// Carries mode, case-folding flag, pre-parsed ignore-region byte ranges,
43/// and pre-compiled code-level regex patterns that skip matching tokens during detection.
44///
45/// Code-level ignore patterns (v4 `ignorePattern`) work by matching regex patterns
46/// against source text, collecting byte ranges of matches, and then filtering
47/// any token whose byte range overlaps a match — identical in effect to v4's
48/// `setupIgnorePatterns` which injected Prism grammar tokens.
49#[derive(Debug, Clone)]
50pub struct TokenizeOptions {
51    pub mode: Mode,
52    /// When true, token values are lowercased before hashing.
53    pub ignore_case: bool,
54    /// Ignored byte ranges from `jscpd:ignore-start` / `jscpd:ignore-end`
55    /// and code-level regex matches from `ignorePattern`.
56    /// Each entry is `[start_byte, end_byte)`.
57    pub ignore_ranges: Vec<[usize; 2]>,
58    /// Pre-compiled code-level regex patterns inherited from v4 `ignorePattern`.
59    /// Before tokenization, these are matched against the source text and
60    /// overlapping byte ranges are added to `ignore_ranges`.
61    pub code_ignore_regexes: Vec<regex::Regex>,
62}
63
64impl TokenizeOptions {
65    pub fn new(mode: Mode) -> Self {
66        Self {
67            mode,
68            ignore_case: false,
69            ignore_ranges: Vec::new(),
70            code_ignore_regexes: Vec::new(),
71        }
72    }
73
74    /// Build TokenizeOptions with pre-compiled regex patterns from string patterns.
75    /// Invalid regex patterns are silently skipped.
76    pub fn with_code_ignore_patterns(mode: Mode, patterns: &[String]) -> Self {
77        let code_ignore_regexes: Vec<regex::Regex> = patterns
78            .iter()
79            .filter_map(|p| regex::Regex::new(p).ok())
80            .collect();
81        Self {
82            mode,
83            ignore_case: false,
84            ignore_ranges: Vec::new(),
85            code_ignore_regexes,
86        }
87    }
88}
89
90/// Compute byte ranges of all regex matches against source text.
91/// Used to populate `ignore_ranges` from `ignorePattern` regexes before
92/// tokenization, matching v4 semantics where regex patterns match against
93/// source text regions (not individual token values).
94pub fn code_ignore_ranges(source: &str, regexes: &[regex::Regex]) -> Vec<[usize; 2]> {
95    let mut ranges = Vec::new();
96    for re in regexes {
97        for m in re.find_iter(source) {
98            ranges.push([m.start(), m.end()]);
99        }
100    }
101    ranges
102}
103
104/// Push a token into the detection output if it passes all filters.
105///
106/// Filtering happens here — at tokenize time — so the resulting
107/// `Vec<DetectionToken>` passed to detection is already minimal.
108/// Token values are not stored; only the pre-computed hash is kept.
109///
110/// The argument count is intentional: this function is a hot-path helper
111/// called from every tokenizer branch; grouping parameters into a struct
112/// would add an extra dereference per call.
113#[allow(clippy::too_many_arguments)]
114#[inline]
115pub fn push_token(
116    tokens: &mut Vec<DetectionToken>,
117    kind: TokenKind,
118    value: &str,
119    byte_start: usize,
120    byte_end: usize,
121    start: cpd_core::models::Location,
122    end: cpd_core::models::Location,
123    options: &TokenizeOptions,
124) {
125    // Drop Ignore-marked tokens in all modes.
126    if kind == TokenKind::Ignore {
127        return;
128    }
129    // Drop tokens in Ignore byte ranges.
130    // This covers both jscpd:ignore-start/end markers and code-level ignorePattern
131    // regex ranges (which are computed from source text before tokenization).
132    if options
133        .ignore_ranges
134        .iter()
135        .any(|[rs, re]| byte_start < *re && byte_end > *rs)
136    {
137        return;
138    }
139    // Mode-based filtering:
140    match options.mode {
141        Mode::Mild => {
142            if kind == TokenKind::Whitespace {
143                return;
144            }
145        }
146        Mode::Weak => {
147            if matches!(
148                kind,
149                TokenKind::Whitespace | TokenKind::Comment | TokenKind::BlockComment
150            ) {
151                return;
152            }
153        }
154        Mode::Strict => {} // keep everything (except Ignore, handled above)
155    }
156    tokens.push(DetectionToken {
157        hash: hash_token(kind.discriminant(), value, options.ignore_case),
158        start,
159        end,
160        range: [byte_start, byte_end],
161    });
162}
163
164/// Tokenize source code in the given format with the given mode.
165/// Returns a Vec<Token>. Never panics on empty input — returns empty Vec.
166///
167/// This is the display/reporter path. For the detection path, use
168/// `tokenize_to_detection`.
169pub fn tokenize(format: &str, source: &str, mode: Mode) -> Vec<Token> {
170    let raw = dispatch_tokenizer(format, source, mode);
171    // Apply mode filter inline — keeps Ignore tokens removed, drops Whitespace in
172    // Mild, drops Whitespace+Comment+BlockComment in Weak, keeps all in Strict.
173    raw.into_iter().filter(|t| keep_token(t, mode)).collect()
174}
175
176fn keep_token(token: &Token, mode: Mode) -> bool {
177    if token.kind == TokenKind::Ignore {
178        return false;
179    }
180    match mode {
181        Mode::Mild => !matches!(token.kind, TokenKind::Whitespace),
182        Mode::Weak => !matches!(
183            token.kind,
184            TokenKind::Whitespace | TokenKind::Comment | TokenKind::BlockComment
185        ),
186        Mode::Strict => true,
187    }
188}
189
190/// Tokenize source code for the detection hot path.
191///
192/// Returns `Vec<DetectionToken>` — tokens filtered and hashed inline at
193/// tokenize time. No per-token heap allocation survives in the output:
194/// the value string is consumed; only the hash, locations, and byte range
195/// are stored.
196///
197/// This replaces the `tokenize` → `apply_mode` → convert-to-hashes pipeline
198/// that existed in `detect.rs`.
199pub fn tokenize_to_detection(
200    format: &str,
201    source: &str,
202    options: &TokenizeOptions,
203) -> Vec<DetectionToken> {
204    // Produce the display tokens first (reuse existing tokenizer code),
205    // then convert to DetectionToken in one pass applying options filters.
206    //
207    // This approach is conservative: it reuses all existing tokenizer logic
208    // without risk of introducing per-tokenizer bugs. The conversion is O(n)
209    // and eliminates the separate filter pass and hash computation that
210    // previously happened inside detect.rs.
211    let raw = dispatch_tokenizer(format, source, options.mode);
212    let mut detection = Vec::with_capacity(raw.len());
213    for t in raw {
214        let byte_start = t.start.offset as usize;
215        let byte_end = t.end.offset as usize;
216        push_token(
217            &mut detection,
218            t.kind,
219            &t.value,
220            byte_start,
221            byte_end,
222            t.start,
223            t.end,
224            options,
225        );
226    }
227    detection
228}
229
230fn dispatch_tokenizer(format: &str, source: &str, mode: Mode) -> Vec<Token> {
231    match format {
232        "javascript" | "typescript" | "jsx" | "tsx" => {
233            crate::javascript::tokenize_js(source, format)
234        }
235        "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc(source, format, mode),
236        "markdown" | "md" => crate::markdown::tokenize_markdown(source, mode),
237        _ => crate::generic::tokenize_generic(source, format),
238    }
239}
240
241/// Tokenize source code into one or more format-specific detection maps.
242///
243/// For single-format files, returns exactly one `TokenMap` with the same format.
244/// For multi-format files (markdown, SFCs), returns one `TokenMap` per detected
245/// sub-language — e.g. markdown prose + embedded JavaScript + embedded Python.
246///
247/// Each map's tokens carry byte offsets relative to the original source, so
248/// they can be used directly for clone detection within their format group.
249pub fn tokenize_to_detection_maps(
250    format: &str,
251    source: &str,
252    options: &TokenizeOptions,
253) -> Vec<TokenMap> {
254    match format {
255        "markdown" | "md" => crate::markdown::tokenize_markdown_maps(source, options),
256        "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc_maps(source, format, options),
257        _ => {
258            let tokens = tokenize_to_detection(format, source, options);
259            vec![TokenMap {
260                format: format.to_string(),
261                tokens,
262            }]
263        }
264    }
265}
266
#[cfg(test)]
mod tests {
    use super::*;
    use cpd_core::models::Location;

    /// Location fixture shared by the `push_token` tests. The filters under
    /// test look only at the token kind, the byte range and the options, so
    /// every token can reuse the same location.
    fn origin() -> Location {
        Location {
            line: 1,
            column: 0,
            offset: 0,
        }
    }

    /// Feed one token covering bytes `[range[0], range[1])` through `push_token`.
    fn push(
        tokens: &mut Vec<DetectionToken>,
        kind: TokenKind,
        value: &str,
        range: [usize; 2],
        opts: &TokenizeOptions,
    ) {
        push_token(
            tokens,
            kind,
            value,
            range[0],
            range[1],
            origin(),
            origin(),
            opts,
        );
    }

    #[test]
    fn mode_from_str_defaults_to_mild() {
        assert_eq!("unknown".parse::<Mode>().unwrap(), Mode::Mild);
        assert_eq!("mild".parse::<Mode>().unwrap(), Mode::Mild);
    }

    #[test]
    fn mode_from_str_weak() {
        assert_eq!("weak".parse::<Mode>().unwrap(), Mode::Weak);
    }

    #[test]
    fn mode_from_str_strict() {
        assert_eq!("strict".parse::<Mode>().unwrap(), Mode::Strict);
    }

    #[test]
    fn tokenize_to_detection_returns_detection_tokens() {
        let opts = TokenizeOptions::new(Mode::Mild);
        let tokens = tokenize_to_detection("javascript", "function hello() { return 42; }", &opts);
        assert!(
            !tokens.is_empty(),
            "must produce DetectionTokens for valid JS"
        );
    }

    #[test]
    fn tokenize_to_detection_mild_excludes_whitespace() {
        // Mild filters a superset of what Strict filters, so for the same
        // source it can never produce more tokens than Strict does.
        // Note (from the original test): the JS tokenizer doesn't produce
        // Whitespace-kind tokens for OXC tokens, so the two counts may be
        // equal; the contract checked here is only "Mild <= Strict".
        let mild = tokenize_to_detection("javascript", "a b c", &TokenizeOptions::new(Mode::Mild));
        let strict =
            tokenize_to_detection("javascript", "a b c", &TokenizeOptions::new(Mode::Strict));
        assert!(
            mild.len() <= strict.len(),
            "Mild must never keep more tokens than Strict"
        );
    }

    #[test]
    fn push_token_drops_ignore_kind() {
        let mut tokens = Vec::new();
        let opts = TokenizeOptions::new(Mode::Mild);
        push(&mut tokens, TokenKind::Ignore, "secret", [0, 6], &opts);
        assert!(tokens.is_empty(), "Ignore-kind tokens must be dropped");
    }

    #[test]
    fn push_token_drops_whitespace_in_mild_mode() {
        let mut tokens = Vec::new();
        let opts = TokenizeOptions::new(Mode::Mild);
        push(&mut tokens, TokenKind::Whitespace, " ", [0, 1], &opts);
        assert!(tokens.is_empty(), "Whitespace must be dropped in Mild mode");
    }

    #[test]
    fn push_token_keeps_whitespace_in_strict_mode() {
        let mut tokens = Vec::new();
        let opts = TokenizeOptions::new(Mode::Strict);
        push(&mut tokens, TokenKind::Whitespace, " ", [0, 1], &opts);
        assert_eq!(tokens.len(), 1, "Whitespace must be kept in Strict mode");
    }

    #[test]
    fn push_token_drops_comment_in_weak_mode() {
        let mut tokens = Vec::new();
        let opts = TokenizeOptions::new(Mode::Weak);
        push(&mut tokens, TokenKind::Comment, "// note", [0, 7], &opts);
        assert!(tokens.is_empty(), "Comment must be dropped in Weak mode");
    }

    #[test]
    fn push_token_ignore_case_folds_hash() {
        let mut upper = Vec::new();
        let mut lower = Vec::new();
        let mut opts = TokenizeOptions::new(Mode::Mild);
        opts.ignore_case = true;
        push(&mut upper, TokenKind::Identifier, "Hello", [0, 5], &opts);
        push(&mut lower, TokenKind::Identifier, "hello", [0, 5], &opts);
        assert_eq!(
            upper[0].hash, lower[0].hash,
            "ignore_case must fold case in hash"
        );
    }

    #[test]
    fn push_token_code_ignore_range_skips_overlapping_token() {
        // Simulate: source = "foo// cpd-disable" (17 bytes)
        // regex "//\\s*cpd-disable" matches bytes 3..17
        // Token "foo" is at 0..3 (no overlap -> kept)
        // Token "// cpd-disable" is at 3..17 (overlaps -> skipped)
        let mut tokens = Vec::new();
        let mut opts = TokenizeOptions::new(Mode::Mild);
        // Pre-computed byte ranges from regex match on source text
        opts.ignore_ranges = vec![[3, 17]];
        push(&mut tokens, TokenKind::Identifier, "foo", [0, 3], &opts);
        push(
            &mut tokens,
            TokenKind::Comment,
            "// cpd-disable",
            [3, 17],
            &opts,
        );
        assert_eq!(tokens.len(), 1, "only the non-matching token should remain");
        assert_eq!(tokens[0].range, [0, 3]);
    }

    #[test]
    fn push_token_code_ignore_range_no_overlap_keeps_all() {
        // regex match at bytes 100..120 doesn't overlap tokens at 0..3, 3..6
        let mut tokens = Vec::new();
        let mut opts = TokenizeOptions::new(Mode::Mild);
        opts.ignore_ranges = vec![[100, 120]];
        push(&mut tokens, TokenKind::Identifier, "foo", [0, 3], &opts);
        push(&mut tokens, TokenKind::Identifier, "bar", [3, 6], &opts);
        assert_eq!(
            tokens.len(),
            2,
            "both tokens should remain when range doesn't overlap"
        );
    }

    #[test]
    fn code_ignore_ranges_computes_from_source_text() {
        let source = "import foo from 'bar';\nconst x = 1;";
        let re = regex::Regex::new(r"import\s+\w+\s+from").unwrap();
        let ranges = code_ignore_ranges(source, &[re]);
        assert_eq!(ranges.len(), 1, "should find one regex match");
        // "import foo from" starts at byte 0, ends at byte 15
        assert_eq!(ranges[0], [0, 15]);
    }

    #[test]
    fn code_ignore_ranges_multiple_patterns() {
        let source = "// MIT License\nfunction foo() {}\n// Copyright";
        let re1 = regex::Regex::new(r"//\s*MIT\s+License").unwrap();
        let re2 = regex::Regex::new(r"//\s*Copyright").unwrap();
        let ranges = code_ignore_ranges(source, &[re1, re2]);
        assert_eq!(ranges.len(), 2, "should find two regex matches");
    }

    #[test]
    fn code_ignore_ranges_empty_regexes() {
        let source = "function foo() {}";
        let ranges = code_ignore_ranges(source, &[]);
        assert!(ranges.is_empty(), "no regexes means no ranges");
    }

    #[test]
    fn with_code_ignore_patterns_builds_regexes() {
        let opts = TokenizeOptions::with_code_ignore_patterns(
            Mode::Mild,
            &["function".to_string(), r"//\s*cpd-disable".to_string()],
        );
        assert_eq!(opts.code_ignore_regexes.len(), 2);
        assert!(opts.code_ignore_regexes[0].is_match("function"));
        assert!(opts.code_ignore_regexes[1].is_match("// cpd-disable"));
        assert!(!opts.code_ignore_regexes[1].is_match("function"));
    }

    #[test]
    fn tokenize_to_detection_with_code_ignore_ranges_skips_imports() {
        let source = "import * from 'lodash';\nconst x = 1;";
        let regexes = vec![regex::Regex::new(r"import\s+\*\s+from").unwrap()];
        let ranges = code_ignore_ranges(source, &regexes);
        assert!(!ranges.is_empty(), "should find regex match in source");
        let [ignored_start, ignored_end] = ranges[0];

        let mut opts = TokenizeOptions::new(Mode::Mild);
        opts.ignore_ranges = ranges;
        let tokens = tokenize_to_detection("javascript", source, &opts);

        // Tokens whose byte ranges overlap the import match must be skipped:
        // "import" (0-6), "*" (7-8) and "from" (9-13) all fall inside it.
        assert!(
            tokens
                .iter()
                .all(|t| !(t.range[0] < ignored_end && t.range[1] > ignored_start)),
            "no surviving token may overlap the ignored import range"
        );

        // ...while "const" (24-29), "x" (30-31) etc. on the next line remain.
        let has_const = tokens.iter().any(|t| t.range[0] >= 24);
        assert!(
            has_const,
            "tokens after the import line should still be present"
        );
    }

    #[test]
    fn code_ignore_ranges_multi_token_match() {
        // The key test: regex "import.*from" matches multi-token source text
        // like "import * from 'module-name'" — not just a single token value.
        let source = "import * from 'lodash';\nconst result = 42;";
        let re = regex::Regex::new(r"import\s+.*?\s+from").unwrap();
        let ranges = code_ignore_ranges(source, &[re]);
        assert_eq!(
            ranges.len(),
            1,
            "should find one regex match spanning import statement"
        );
        // The lazy `.*?` stops at the first "from": "import * from" is bytes 0..13.
        assert_eq!(
            ranges[0],
            [0, 13],
            "match should span the whole `import * from` prefix"
        );
    }
}
cpd_tokenizer/tokenizer.rs

cpd_tokenizer/
tokenizer.rs