// cpd_tokenizer/tokenizer.rs

1use std::str::FromStr;
2
3use cpd_core::hash::hash_token;
4use cpd_core::models::{DetectionToken, Token, TokenKind};
5
6/// A sub-format detection map produced by multi-format tokenizers.
7///
8/// For single-format files, `tokenize_to_detection_maps()` returns exactly one
9/// TokenMap with the same format as the file.
10///
11/// For multi-format files (markdown, SFC), one TokenMap is returned per
12/// detected sub-language, each carrying tokens that should enter that
13/// format's detection pool.
14#[derive(Debug, Clone)]
15pub struct TokenMap {
16    pub format: String,
17    pub tokens: Vec<DetectionToken>,
18}
19
/// Token filtering mode applied while tokenizing.
///
/// `Ignore`-kind tokens are dropped in every mode; the variants differ only in
/// which additional token kinds are removed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Mode {
    /// Drops whitespace tokens. This is the default mode.
    #[default]
    Mild,
    /// Drops whitespace, line-comment, and block-comment tokens.
    Weak,
    /// Keeps every token kind other than `Ignore`.
    Strict,
}

impl FromStr for Mode {
    // Parsing never fails; the unit error type is kept so existing callers that
    // name `<Mode as FromStr>::Err` continue to compile.
    type Err = ();

    /// Parses a mode name.
    ///
    /// `"weak"` and `"strict"` select the matching variant. Every other input,
    /// including `"mild"`, the empty string, and unrecognized names, falls back
    /// to [`Mode::Mild`]. The match is exact and case-sensitive.
    ///
    /// # Errors
    ///
    /// Never returns `Err`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mode = match s {
            "weak" => Self::Weak,
            "strict" => Self::Strict,
            // Unknown names deliberately fall back to the default mode.
            _ => Self::Mild,
        };
        Ok(mode)
    }
}
39
40/// Options for the detection-path tokenizer.
41///
42/// Carries mode, case-folding flag, and pre-parsed ignore-region byte ranges.
43/// These are applied inside `push_token` — tokens filtered here are never stored.
44#[derive(Debug, Clone)]
45pub struct TokenizeOptions {
46    pub mode: Mode,
47    /// When true, token values are lowercased before hashing.
48    pub ignore_case: bool,
49    /// Ignored byte ranges from `jscpd:ignore-start` / `jscpd:ignore-end`.
50    /// Each entry is `[start_byte, end_byte)`.
51    pub ignore_ranges: Vec<[usize; 2]>,
52}
53
54impl TokenizeOptions {
55    pub fn new(mode: Mode) -> Self {
56        Self {
57            mode,
58            ignore_case: false,
59            ignore_ranges: Vec::new(),
60        }
61    }
62}
63
64/// Push a token into the detection output if it passes all filters.
65///
66/// Filtering happens here — at tokenize time — so the resulting
67/// `Vec<DetectionToken>` passed to detection is already minimal.
68/// Token values are not stored; only the pre-computed hash is kept.
69///
70/// The argument count is intentional: this function is a hot-path helper
71/// called from every tokenizer branch; grouping parameters into a struct
72/// would add an extra dereference per call.
73#[allow(clippy::too_many_arguments)]
74#[inline]
75pub fn push_token(
76    tokens: &mut Vec<DetectionToken>,
77    kind: TokenKind,
78    value: &str,
79    byte_start: usize,
80    byte_end: usize,
81    start: cpd_core::models::Location,
82    end: cpd_core::models::Location,
83    options: &TokenizeOptions,
84) {
85    // Drop Ignore-marked tokens in all modes.
86    if kind == TokenKind::Ignore {
87        return;
88    }
89    // Drop tokens in Ignore byte ranges.
90    if options
91        .ignore_ranges
92        .iter()
93        .any(|[rs, re]| byte_start < *re && byte_end > *rs)
94    {
95        return;
96    }
97    // Mode-based filtering:
98    match options.mode {
99        Mode::Mild => {
100            if kind == TokenKind::Whitespace {
101                return;
102            }
103        }
104        Mode::Weak => {
105            if matches!(
106                kind,
107                TokenKind::Whitespace | TokenKind::Comment | TokenKind::BlockComment
108            ) {
109                return;
110            }
111        }
112        Mode::Strict => {} // keep everything (except Ignore, handled above)
113    }
114    tokens.push(DetectionToken {
115        hash: hash_token(kind.discriminant(), value, options.ignore_case),
116        start,
117        end,
118        range: [byte_start, byte_end],
119    });
120}
121
122/// Tokenize source code in the given format with the given mode.
123/// Returns a Vec<Token>. Never panics on empty input — returns empty Vec.
124///
125/// This is the display/reporter path. For the detection path, use
126/// `tokenize_to_detection`.
127pub fn tokenize(format: &str, source: &str, mode: Mode) -> Vec<Token> {
128    let raw = dispatch_tokenizer(format, source, mode);
129    // Apply mode filter inline — keeps Ignore tokens removed, drops Whitespace in
130    // Mild, drops Whitespace+Comment+BlockComment in Weak, keeps all in Strict.
131    raw.into_iter().filter(|t| keep_token(t, mode)).collect()
132}
133
134fn keep_token(token: &Token, mode: Mode) -> bool {
135    if token.kind == TokenKind::Ignore {
136        return false;
137    }
138    match mode {
139        Mode::Mild => !matches!(token.kind, TokenKind::Whitespace),
140        Mode::Weak => !matches!(
141            token.kind,
142            TokenKind::Whitespace | TokenKind::Comment | TokenKind::BlockComment
143        ),
144        Mode::Strict => true,
145    }
146}
147
148/// Tokenize source code for the detection hot path.
149///
150/// Returns `Vec<DetectionToken>` — tokens filtered and hashed inline at
151/// tokenize time. No per-token heap allocation survives in the output:
152/// the value string is consumed; only the hash, locations, and byte range
153/// are stored.
154///
155/// This replaces the `tokenize` → `apply_mode` → convert-to-hashes pipeline
156/// that existed in `detect.rs`.
157pub fn tokenize_to_detection(
158    format: &str,
159    source: &str,
160    options: &TokenizeOptions,
161) -> Vec<DetectionToken> {
162    // Produce the display tokens first (reuse existing tokenizer code),
163    // then convert to DetectionToken in one pass applying options filters.
164    //
165    // This approach is conservative: it reuses all existing tokenizer logic
166    // without risk of introducing per-tokenizer bugs. The conversion is O(n)
167    // and eliminates the separate filter pass and hash computation that
168    // previously happened inside detect.rs.
169    let raw = dispatch_tokenizer(format, source, options.mode);
170    let mut detection = Vec::with_capacity(raw.len());
171    for t in raw {
172        let byte_start = t.start.offset as usize;
173        let byte_end = t.end.offset as usize;
174        push_token(
175            &mut detection,
176            t.kind,
177            &t.value,
178            byte_start,
179            byte_end,
180            t.start,
181            t.end,
182            options,
183        );
184    }
185    detection
186}
187
188fn dispatch_tokenizer(format: &str, source: &str, mode: Mode) -> Vec<Token> {
189    match format {
190        "javascript" | "typescript" | "jsx" | "tsx" => {
191            crate::javascript::tokenize_js(source, format)
192        }
193        "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc(source, format, mode),
194        "markdown" | "md" => crate::markdown::tokenize_markdown(source, mode),
195        _ => crate::generic::tokenize_generic(source, format),
196    }
197}
198
199/// Tokenize source code into one or more format-specific detection maps.
200///
201/// For single-format files, returns exactly one `TokenMap` with the same format.
202/// For multi-format files (markdown, SFCs), returns one `TokenMap` per detected
203/// sub-language — e.g. markdown prose + embedded JavaScript + embedded Python.
204///
205/// Each map's tokens carry byte offsets relative to the original source, so
206/// they can be used directly for clone detection within their format group.
207pub fn tokenize_to_detection_maps(
208    format: &str,
209    source: &str,
210    options: &TokenizeOptions,
211) -> Vec<TokenMap> {
212    match format {
213        "markdown" | "md" => crate::markdown::tokenize_markdown_maps(source, options),
214        "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc_maps(source, format, options),
215        _ => {
216            let tokens = tokenize_to_detection(format, source, options);
217            vec![TokenMap {
218                format: format.to_string(),
219                tokens,
220            }]
221        }
222    }
223}
224
#[cfg(test)]
mod tests {
    use super::*;

    /// Location at the very start of a file. These tests exercise filtering
    /// and hashing only, so every token reuses this position.
    fn origin() -> cpd_core::models::Location {
        cpd_core::models::Location {
            line: 1,
            column: 0,
            offset: 0,
        }
    }

    /// Runs `push_token` once on an empty output vector and returns the result.
    fn push_one(
        kind: TokenKind,
        value: &str,
        byte_start: usize,
        byte_end: usize,
        options: &TokenizeOptions,
    ) -> Vec<DetectionToken> {
        let mut tokens = Vec::new();
        push_token(
            &mut tokens,
            kind,
            value,
            byte_start,
            byte_end,
            origin(),
            origin(),
            options,
        );
        tokens
    }

    #[test]
    fn mode_from_str_defaults_to_mild() {
        assert_eq!("unknown".parse::<Mode>().unwrap(), Mode::Mild);
        assert_eq!("mild".parse::<Mode>().unwrap(), Mode::Mild);
    }

    #[test]
    fn mode_from_str_weak() {
        assert_eq!("weak".parse::<Mode>().unwrap(), Mode::Weak);
    }

    #[test]
    fn mode_from_str_strict() {
        assert_eq!("strict".parse::<Mode>().unwrap(), Mode::Strict);
    }

    #[test]
    fn tokenize_to_detection_returns_detection_tokens() {
        let opts = TokenizeOptions::new(Mode::Mild);
        let tokens = tokenize_to_detection("javascript", "function hello() { return 42; }", &opts);
        assert!(
            !tokens.is_empty(),
            "must produce DetectionTokens for valid JS"
        );
    }

    #[test]
    fn tokenize_to_detection_mild_excludes_whitespace() {
        // Both calls see the same raw tokens; Mild filters a superset of what
        // Strict filters, so its output can never be longer.
        // Note: the JS tokenizer doesn't produce Whitespace kind for OXC tokens,
        // so the counts may be equal; the push_token tests below cover the
        // actual whitespace-dropping contract.
        let mild = tokenize_to_detection("javascript", "a b c", &TokenizeOptions::new(Mode::Mild));
        let strict =
            tokenize_to_detection("javascript", "a b c", &TokenizeOptions::new(Mode::Strict));
        assert!(
            mild.len() <= strict.len(),
            "Mild must not produce more tokens than Strict"
        );
    }

    #[test]
    fn push_token_drops_ignore_kind() {
        let opts = TokenizeOptions::new(Mode::Mild);
        let tokens = push_one(TokenKind::Ignore, "secret", 0, 6, &opts);
        assert!(tokens.is_empty(), "Ignore-kind tokens must be dropped");
    }

    #[test]
    fn push_token_drops_whitespace_in_mild_mode() {
        let opts = TokenizeOptions::new(Mode::Mild);
        let tokens = push_one(TokenKind::Whitespace, " ", 0, 1, &opts);
        assert!(tokens.is_empty(), "Whitespace must be dropped in Mild mode");
    }

    #[test]
    fn push_token_keeps_whitespace_in_strict_mode() {
        let opts = TokenizeOptions::new(Mode::Strict);
        let tokens = push_one(TokenKind::Whitespace, " ", 0, 1, &opts);
        assert_eq!(tokens.len(), 1, "Whitespace must be kept in Strict mode");
    }

    #[test]
    fn push_token_drops_comment_in_weak_mode() {
        let opts = TokenizeOptions::new(Mode::Weak);
        let tokens = push_one(TokenKind::Comment, "// note", 0, 7, &opts);
        assert!(tokens.is_empty(), "Comment must be dropped in Weak mode");
    }

    #[test]
    fn push_token_drops_tokens_overlapping_ignore_ranges() {
        let mut opts = TokenizeOptions::new(Mode::Mild);
        opts.ignore_ranges = vec![[0, 10]];

        let inside = push_one(TokenKind::Identifier, "abc", 2, 5, &opts);
        assert!(inside.is_empty(), "token inside an ignored range must be dropped");

        let straddling = push_one(TokenKind::Identifier, "abcd", 8, 12, &opts);
        assert!(
            straddling.is_empty(),
            "token partially overlapping an ignored range must be dropped"
        );

        // Ranges are half-open, so a token starting at the end byte is kept.
        let after = push_one(TokenKind::Identifier, "ab", 10, 12, &opts);
        assert_eq!(after.len(), 1, "token starting at range end must be kept");
        assert_eq!(after[0].range, [10, 12]);
    }

    #[test]
    fn push_token_ignore_case_folds_hash() {
        let mut opts = TokenizeOptions::new(Mode::Mild);
        opts.ignore_case = true;
        let upper = push_one(TokenKind::Identifier, "Hello", 0, 5, &opts);
        let lower = push_one(TokenKind::Identifier, "hello", 0, 5, &opts);
        assert_eq!(
            upper[0].hash, lower[0].hash,
            "ignore_case must fold case in hash"
        );
    }
}