cpd_tokenizer/
generic.rs

1// cpd-tokenizer: generic whitespace-and-punctuation tokenizer for non-JS/TS formats.
2// Handles comment styles, ignore regions, and per-line token scanning without regex.
3
4use cpd_core::models::{Location, Token, TokenKind};
5
/// Comment syntax family the generic tokenizer uses to recognise comments.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CommentStyle {
    /// Single-line `//`, block `/* */`. Also the fallback for unrecognised formats.
    CStyle,
    /// Single-line `#`
    Hash,
    /// Single-line `--`
    DoubleDash,
    /// Single-line `--`, block `--[[ ]]`
    Lua,
    /// Single-line `;`
    Semicolon,
    /// Single-line `'`
    VisualBasic,
    /// No comment syntax at all.
    // `comment_style` never returns this variant (unknown formats fall back to `CStyle`),
    // so it is matched on but never constructed — hence the lint suppression.
    #[allow(dead_code)]
    None,
}

/// Maps a format name to the comment syntax used when tokenizing it.
///
/// Format names are matched exactly (case-sensitive). Any name that is not listed
/// falls back to [`CommentStyle::CStyle`]; the explicit C-style list exists to
/// document which formats were deliberately classified that way.
fn comment_style(format: &str) -> CommentStyle {
    match format {
        "c" | "c-header" | "cpp" | "cpp-header" | "csharp" | "java" | "go" | "rust" | "swift"
        | "kotlin" | "scala" | "dart" | "php" | "typescript" | "jsx" | "tsx" | "javascript"
        | "groovy" | "d" | "glsl" | "hlsl" | "wgsl" | "openqasm" | "solidity" | "bicep" | "hcl"
        | "json5" | "less" | "scss" | "css" | "objectivec" | "protobuf" | "apex" | "verilog"
        | "zig" | "odin" | "fsharp" | "actionscript" | "cfscript" => CommentStyle::CStyle,

        // Java `.properties` files start comments with `#` (or `!`), not `;`, so they
        // belong with the hash-comment formats rather than with INI-style files.
        "python" | "ruby" | "perl" | "bash" | "sh" | "zsh" | "fish" | "r" | "julia" | "yaml"
        | "toml" | "dockerfile" | "makefile" | "cmake" | "coffeescript" | "crystal" | "nim"
        | "gdscript" | "elixir" | "awk" | "tcl" | "powershell" | "puppet" | "ignore"
        | "properties" => CommentStyle::Hash,

        "sql" | "haskell" | "elm" | "ada" | "plsql" => CommentStyle::DoubleDash,

        "lua" => CommentStyle::Lua,

        "ini" | "asm6502" | "nasm" => CommentStyle::Semicolon,

        "vb" | "vbs" | "basic" | "vbnet" | "visual-basic" => CommentStyle::VisualBasic,

        _ => CommentStyle::CStyle,
    }
}
50
/// Reports whether `text` contains the `jscpd:ignore-start` marker anywhere.
fn is_ignore_start(text: &str) -> bool {
    const START_MARKER: &str = "jscpd:ignore-start";
    text.contains(START_MARKER)
}
54
/// Reports whether `text` contains the `jscpd:ignore-end` marker anywhere.
fn is_ignore_end(text: &str) -> bool {
    const END_MARKER: &str = "jscpd:ignore-end";
    text.contains(END_MARKER)
}
58
59fn make_token(kind: TokenKind, value: &str, line: u32, col: u32, offset: u32) -> Token {
60    let len = value.len() as u32;
61    Token {
62        kind,
63        value: value.to_string(),
64        start: Location {
65            line,
66            column: col,
67            offset,
68        },
69        end: Location {
70            line,
71            column: col + len,
72            offset: offset + len,
73        },
74    }
75}
76
77fn classify_word(word: &str) -> TokenKind {
78    if word.chars().all(|c| c.is_ascii_digit()) {
79        return TokenKind::Literal;
80    }
81    if word.chars().all(|c| c.is_ascii_punctuation()) {
82        return TokenKind::Punctuation;
83    }
84    TokenKind::Identifier
85}
86
87fn tokenize_line_content(
88    line: &str,
89    line_num: u32,
90    line_offset: u32,
91    style: CommentStyle,
92    in_ignore: bool,
93    in_block_comment: &mut bool,
94) -> Vec<Token> {
95    let mut tokens = Vec::new();
96
97    // Collect (byte_offset, char) pairs once — zero heap allocation vs chars().collect().
98    // `char_indices()` returns (byte_index, char) which gives us correct UTF-8 byte offsets
99    // for column accounting while avoiding a Vec<char> heap allocation per line.
100    let chars: Vec<(usize, char)> = line.char_indices().collect();
101    let n = chars.len();
102    let mut i = 0usize;
103
104    // col is in bytes (UTF-8 units), consistent with char.len_utf8() increments below.
105    let mut col = 0u32;
106
107    macro_rules! offset {
108        () => {
109            line_offset + col
110        };
111    }
112
113    while i < n {
114        let (_, ch) = chars[i];
115
116        // Handle block comment end
117        if *in_block_comment {
118            if matches!(style, CommentStyle::CStyle)
119                && i + 1 < n
120                && ch == '*'
121                && chars[i + 1].1 == '/'
122            {
123                let start_col = col;
124                let start_off = offset!();
125                col += 2;
126                i += 2;
127                let kind = if in_ignore {
128                    TokenKind::Ignore
129                } else {
130                    TokenKind::Comment
131                };
132                tokens.push(make_token(kind, "*/", line_num, start_col, start_off));
133                *in_block_comment = false;
134                continue;
135            }
136            // Still inside block comment — consume char
137            let start_col = col;
138            let start_off = offset!();
139            let mut s = String::new();
140            s.push(ch);
141            col += ch.len_utf8() as u32;
142            i += 1;
143            let kind = if in_ignore {
144                TokenKind::Ignore
145            } else {
146                TokenKind::Comment
147            };
148            tokens.push(make_token(kind, &s, line_num, start_col, start_off));
149            continue;
150        }
151
152        // Lua long block comment --[[
153        if matches!(style, CommentStyle::Lua)
154            && i + 3 < n
155            && ch == '-'
156            && chars[i + 1].1 == '-'
157            && chars[i + 2].1 == '['
158            && chars[i + 3].1 == '['
159        {
160            let rest = &line[chars[i].0..];
161            let kind = if in_ignore {
162                TokenKind::Ignore
163            } else {
164                TokenKind::Comment
165            };
166            tokens.push(make_token(kind, rest, line_num, col, offset!()));
167            break;
168        }
169
170        // C-style block comment open /*
171        if matches!(style, CommentStyle::CStyle) && i + 1 < n && ch == '/' && chars[i + 1].1 == '*'
172        {
173            *in_block_comment = true;
174            let start_col = col;
175            let start_off = offset!();
176            col += 2;
177            i += 2;
178            let kind = if in_ignore {
179                TokenKind::Ignore
180            } else {
181                TokenKind::Comment
182            };
183            tokens.push(make_token(kind, "/*", line_num, start_col, start_off));
184            continue;
185        }
186
187        // Line comment — check current position directly without allocating
188        let is_comment = match style {
189            CommentStyle::CStyle => i + 1 < n && ch == '/' && chars[i + 1].1 == '/',
190            CommentStyle::Hash => ch == '#',
191            CommentStyle::DoubleDash | CommentStyle::Lua => {
192                i + 1 < n && ch == '-' && chars[i + 1].1 == '-'
193            }
194            CommentStyle::Semicolon => ch == ';',
195            CommentStyle::VisualBasic => ch == '\'',
196            CommentStyle::None => false,
197        };
198
199        if is_comment {
200            let rest = &line[chars[i].0..];
201            let kind = if in_ignore {
202                TokenKind::Ignore
203            } else {
204                TokenKind::Comment
205            };
206            tokens.push(make_token(kind, rest, line_num, col, offset!()));
207            break;
208        }
209
210        // String literals (double-quote or single-quote)
211        if ch == '"' || ch == '\'' {
212            let quote = ch;
213            let start_col = col;
214            let start_off = offset!();
215            let mut j = chars[i].0; // byte start of string in `line`
216            let str_start = j;
217            col += 1;
218            i += 1;
219            j += 1;
220            while i < n && chars[i].1 != quote {
221                if chars[i].1 == '\\' && i + 1 < n {
222                    col += chars[i].1.len_utf8() as u32 + chars[i + 1].1.len_utf8() as u32;
223                    i += 2;
224                } else {
225                    col += chars[i].1.len_utf8() as u32;
226                    i += 1;
227                }
228            }
229            if i < n {
230                col += 1;
231                i += 1;
232            }
233            let str_end = if i < n {
234                chars[i - 1].0 + chars[i - 1].1.len_utf8()
235            } else {
236                line.len()
237            };
238            let _ = (j, str_start); // byte indices computed above but using slice below
239            let s = &line[str_start..str_end];
240            let kind = if in_ignore {
241                TokenKind::Ignore
242            } else {
243                TokenKind::Literal
244            };
245            tokens.push(make_token(kind, s, line_num, start_col, start_off));
246            continue;
247        }
248
249        // Whitespace
250        if ch.is_whitespace() {
251            let start_col = col;
252            let start_off = offset!();
253            let byte_start = chars[i].0;
254            while i < n && chars[i].1.is_whitespace() {
255                col += chars[i].1.len_utf8() as u32;
256                i += 1;
257            }
258            let byte_end = if i < n { chars[i].0 } else { line.len() };
259            let kind = if in_ignore {
260                TokenKind::Ignore
261            } else {
262                TokenKind::Whitespace
263            };
264            tokens.push(make_token(
265                kind,
266                &line[byte_start..byte_end],
267                line_num,
268                start_col,
269                start_off,
270            ));
271            continue;
272        }
273
274        // Numbers
275        if ch.is_ascii_digit() {
276            let start_col = col;
277            let start_off = offset!();
278            let byte_start = chars[i].0;
279            while i < n && (chars[i].1.is_ascii_digit() || chars[i].1 == '.') {
280                col += 1;
281                i += 1;
282            }
283            let byte_end = if i < n { chars[i].0 } else { line.len() };
284            let kind = if in_ignore {
285                TokenKind::Ignore
286            } else {
287                TokenKind::Literal
288            };
289            tokens.push(make_token(
290                kind,
291                &line[byte_start..byte_end],
292                line_num,
293                start_col,
294                start_off,
295            ));
296            continue;
297        }
298
299        // Identifiers / keywords
300        if ch.is_alphabetic() || ch == '_' {
301            let start_col = col;
302            let start_off = offset!();
303            let byte_start = chars[i].0;
304            while i < n && (chars[i].1.is_alphanumeric() || chars[i].1 == '_') {
305                col += chars[i].1.len_utf8() as u32;
306                i += 1;
307            }
308            let byte_end = if i < n { chars[i].0 } else { line.len() };
309            let s = &line[byte_start..byte_end];
310            let kind = if in_ignore {
311                TokenKind::Ignore
312            } else {
313                classify_word(s)
314            };
315            tokens.push(make_token(kind, s, line_num, start_col, start_off));
316            continue;
317        }
318
319        // Operators / punctuation (single char)
320        let start_col = col;
321        let start_off = offset!();
322        let byte_start = chars[i].0;
323        col += ch.len_utf8() as u32;
324        i += 1;
325        let byte_end = if i < n { chars[i].0 } else { line.len() };
326        let kind = if in_ignore {
327            TokenKind::Ignore
328        } else {
329            TokenKind::Punctuation
330        };
331        tokens.push(make_token(
332            kind,
333            &line[byte_start..byte_end],
334            line_num,
335            start_col,
336            start_off,
337        ));
338    }
339
340    tokens
341}
342
343/// Tokenize source in the given format. Never panics on empty input.
344pub fn tokenize_generic(source: &str, format: &str) -> Vec<Token> {
345    if source.is_empty() {
346        return Vec::new();
347    }
348
349    let style = comment_style(format);
350    let mut tokens = Vec::new();
351    let mut in_ignore = false;
352    let mut in_block_comment = false;
353    let mut offset = 0u32;
354
355    for (line_idx, line) in source.lines().enumerate() {
356        let line_num = line_idx as u32 + 1;
357        let trimmed = line.trim();
358
359        if is_ignore_start(trimmed) {
360            in_ignore = true;
361        }
362        if is_ignore_end(trimmed) {
363            in_ignore = false;
364            // Advance offset past this line and continue
365            offset += line.len() as u32 + 1;
366            continue;
367        }
368
369        let line_tokens = tokenize_line_content(
370            line,
371            line_num,
372            offset,
373            style,
374            in_ignore,
375            &mut in_block_comment,
376        );
377        tokens.extend(line_tokens);
378        offset += line.len() as u32 + 1;
379    }
380
381    tokens
382}
383
#[cfg(test)]
mod tests {
    use super::*;

    /// True when tokenizing `source` as `format` yields at least one token of `kind`.
    fn yields_kind(source: &str, format: &str, kind: TokenKind) -> bool {
        tokenize_generic(source, format)
            .iter()
            .any(|token| token.kind == kind)
    }

    #[test]
    fn python_produces_tokens() {
        let produced = tokenize_generic("def hello():\n    return 42\n", "python");
        assert!(!produced.is_empty());
    }

    #[test]
    fn python_hash_comment_marked_as_comment() {
        assert!(
            yields_kind("# this is a comment\nx = 1\n", "python", TokenKind::Comment),
            "Python # comments must be Comment kind"
        );
    }

    #[test]
    fn go_c_style_comment_recognized() {
        assert!(yields_kind(
            "// hello\nfunc main() {}\n",
            "go",
            TokenKind::Comment
        ));
    }

    #[test]
    fn empty_input_returns_empty() {
        let produced = tokenize_generic("", "python");
        assert!(
            produced.is_empty(),
            "empty input must return empty vec, not panic"
        );
    }

    #[test]
    fn unknown_format_does_not_panic() {
        // Run under `catch_unwind` so a panic shows up as a failed assertion.
        let outcome =
            std::panic::catch_unwind(|| tokenize_generic("hello world", "unknown_format_xyz"));
        assert!(outcome.is_ok());
    }

    #[test]
    fn ignore_region_tokens_marked_as_ignore() {
        let source = "x = 1\n# jscpd:ignore-start\ny = 2\n# jscpd:ignore-end\nz = 3\n";
        assert!(
            yields_kind(source, "python", TokenKind::Ignore),
            "tokens in ignore region must be Ignore kind"
        );
    }

    #[test]
    fn sql_double_dash_comment_recognized() {
        assert!(yields_kind(
            "-- a comment\nSELECT * FROM foo;\n",
            "sql",
            TokenKind::Comment
        ));
    }

    #[test]
    fn c_block_comment_recognized() {
        assert!(yields_kind(
            "/* block */\nint x = 1;\n",
            "c",
            TokenKind::Comment
        ));
    }

    #[test]
    fn location_line_numbers_are_1_based() {
        let produced = tokenize_generic("x = 1\ny = 2\n", "python");
        let first = produced.first().expect("at least one token");
        assert_eq!(first.start.line, 1);
    }
}
cpd_tokenizer/generic.rs

cpd_tokenizer/
generic.rs