1use cpd_core::models::{Location, Token, TokenKind};
5
/// Comment syntax family used by the generic tokenizer to recognise comments.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CommentStyle {
    /// `//` line comments and `/* ... */` block comments.
    CStyle,
    /// `#` starts a comment that runs to the end of the line.
    Hash,
    /// `--` starts a comment that runs to the end of the line.
    DoubleDash,
    /// Lua: `--` (including `--[[`) starts a comment that runs to the end of the line.
    Lua,
    /// `;` starts a comment that runs to the end of the line.
    Semicolon,
    /// `'` starts a comment that runs to the end of the line.
    VisualBasic,
    // `comment_style` never returns this variant, hence the lint allowance;
    // the tokenizer still handles it as "no line-comment syntax".
    #[allow(dead_code)]
    /// No line-comment marker is recognised.
    None,
}
24
25fn comment_style(format: &str) -> CommentStyle {
26 match format {
27 "c" | "c-header" | "cpp" | "cpp-header" | "csharp" | "java" | "go" | "rust" | "swift"
28 | "kotlin" | "scala" | "dart" | "php" | "typescript" | "jsx" | "tsx" | "javascript"
29 | "groovy" | "d" | "glsl" | "hlsl" | "wgsl" | "openqasm" | "solidity" | "bicep" | "hcl"
30 | "json5" | "less" | "scss" | "css" | "objectivec" | "protobuf" | "apex" | "verilog"
31 | "zig" | "odin" | "fsharp" | "actionscript" | "cfscript" => CommentStyle::CStyle,
32
33 "python" | "ruby" | "perl" | "bash" | "sh" | "zsh" | "fish" | "r" | "julia" | "yaml"
34 | "toml" | "dockerfile" | "makefile" | "cmake" | "coffeescript" | "crystal" | "nim"
35 | "gdscript" | "elixir" | "awk" | "tcl" | "powershell" | "puppet" | "ignore" => {
36 CommentStyle::Hash
37 }
38
39 "sql" | "haskell" | "elm" | "ada" | "plsql" => CommentStyle::DoubleDash,
40
41 "lua" => CommentStyle::Lua,
42
43 "ini" | "properties" | "asm6502" | "nasm" => CommentStyle::Semicolon,
44
45 "vb" | "vbs" | "basic" | "vbnet" | "visual-basic" => CommentStyle::VisualBasic,
46
47 _ => CommentStyle::CStyle,
48 }
49}
50
/// Returns `true` when `text` contains the marker that opens an ignored region.
fn is_ignore_start(text: &str) -> bool {
    const START_MARKER: &str = "jscpd:ignore-start";
    text.contains(START_MARKER)
}
54
/// Returns `true` when `text` contains the marker that closes an ignored region.
fn is_ignore_end(text: &str) -> bool {
    const END_MARKER: &str = "jscpd:ignore-end";
    text.contains(END_MARKER)
}
58
59fn make_token(kind: TokenKind, value: &str, line: u32, col: u32, offset: u32) -> Token {
60 let len = value.len() as u32;
61 Token {
62 kind,
63 value: value.to_string(),
64 start: Location {
65 line,
66 column: col,
67 offset,
68 },
69 end: Location {
70 line,
71 column: col + len,
72 offset: offset + len,
73 },
74 }
75}
76
77fn classify_word(word: &str) -> TokenKind {
78 if word.chars().all(|c| c.is_ascii_digit()) {
79 return TokenKind::Literal;
80 }
81 if word.chars().all(|c| c.is_ascii_punctuation()) {
82 return TokenKind::Punctuation;
83 }
84 TokenKind::Identifier
85}
86
87fn tokenize_line_content(
88 line: &str,
89 line_num: u32,
90 line_offset: u32,
91 style: CommentStyle,
92 in_ignore: bool,
93 in_block_comment: &mut bool,
94) -> Vec<Token> {
95 let mut tokens = Vec::new();
96
97 let chars: Vec<(usize, char)> = line.char_indices().collect();
101 let n = chars.len();
102 let mut i = 0usize;
103
104 let mut col = 0u32;
106
107 macro_rules! offset {
108 () => {
109 line_offset + col
110 };
111 }
112
113 while i < n {
114 let (_, ch) = chars[i];
115
116 if *in_block_comment {
118 if matches!(style, CommentStyle::CStyle)
119 && i + 1 < n
120 && ch == '*'
121 && chars[i + 1].1 == '/'
122 {
123 let start_col = col;
124 let start_off = offset!();
125 col += 2;
126 i += 2;
127 let kind = if in_ignore {
128 TokenKind::Ignore
129 } else {
130 TokenKind::Comment
131 };
132 tokens.push(make_token(kind, "*/", line_num, start_col, start_off));
133 *in_block_comment = false;
134 continue;
135 }
136 let start_col = col;
138 let start_off = offset!();
139 let mut s = String::new();
140 s.push(ch);
141 col += ch.len_utf8() as u32;
142 i += 1;
143 let kind = if in_ignore {
144 TokenKind::Ignore
145 } else {
146 TokenKind::Comment
147 };
148 tokens.push(make_token(kind, &s, line_num, start_col, start_off));
149 continue;
150 }
151
152 if matches!(style, CommentStyle::Lua)
154 && i + 3 < n
155 && ch == '-'
156 && chars[i + 1].1 == '-'
157 && chars[i + 2].1 == '['
158 && chars[i + 3].1 == '['
159 {
160 let rest = &line[chars[i].0..];
161 let kind = if in_ignore {
162 TokenKind::Ignore
163 } else {
164 TokenKind::Comment
165 };
166 tokens.push(make_token(kind, rest, line_num, col, offset!()));
167 break;
168 }
169
170 if matches!(style, CommentStyle::CStyle) && i + 1 < n && ch == '/' && chars[i + 1].1 == '*'
172 {
173 *in_block_comment = true;
174 let start_col = col;
175 let start_off = offset!();
176 col += 2;
177 i += 2;
178 let kind = if in_ignore {
179 TokenKind::Ignore
180 } else {
181 TokenKind::Comment
182 };
183 tokens.push(make_token(kind, "/*", line_num, start_col, start_off));
184 continue;
185 }
186
187 let is_comment = match style {
189 CommentStyle::CStyle => i + 1 < n && ch == '/' && chars[i + 1].1 == '/',
190 CommentStyle::Hash => ch == '#',
191 CommentStyle::DoubleDash | CommentStyle::Lua => {
192 i + 1 < n && ch == '-' && chars[i + 1].1 == '-'
193 }
194 CommentStyle::Semicolon => ch == ';',
195 CommentStyle::VisualBasic => ch == '\'',
196 CommentStyle::None => false,
197 };
198
199 if is_comment {
200 let rest = &line[chars[i].0..];
201 let kind = if in_ignore {
202 TokenKind::Ignore
203 } else {
204 TokenKind::Comment
205 };
206 tokens.push(make_token(kind, rest, line_num, col, offset!()));
207 break;
208 }
209
210 if ch == '"' || ch == '\'' {
212 let quote = ch;
213 let start_col = col;
214 let start_off = offset!();
215 let mut j = chars[i].0; let str_start = j;
217 col += 1;
218 i += 1;
219 j += 1;
220 while i < n && chars[i].1 != quote {
221 if chars[i].1 == '\\' && i + 1 < n {
222 col += chars[i].1.len_utf8() as u32 + chars[i + 1].1.len_utf8() as u32;
223 i += 2;
224 } else {
225 col += chars[i].1.len_utf8() as u32;
226 i += 1;
227 }
228 }
229 if i < n {
230 col += 1;
231 i += 1;
232 }
233 let str_end = if i < n {
234 chars[i - 1].0 + chars[i - 1].1.len_utf8()
235 } else {
236 line.len()
237 };
238 let _ = (j, str_start); let s = &line[str_start..str_end];
240 let kind = if in_ignore {
241 TokenKind::Ignore
242 } else {
243 TokenKind::Literal
244 };
245 tokens.push(make_token(kind, s, line_num, start_col, start_off));
246 continue;
247 }
248
249 if ch.is_whitespace() {
251 let start_col = col;
252 let start_off = offset!();
253 let byte_start = chars[i].0;
254 while i < n && chars[i].1.is_whitespace() {
255 col += chars[i].1.len_utf8() as u32;
256 i += 1;
257 }
258 let byte_end = if i < n { chars[i].0 } else { line.len() };
259 let kind = if in_ignore {
260 TokenKind::Ignore
261 } else {
262 TokenKind::Whitespace
263 };
264 tokens.push(make_token(
265 kind,
266 &line[byte_start..byte_end],
267 line_num,
268 start_col,
269 start_off,
270 ));
271 continue;
272 }
273
274 if ch.is_ascii_digit() {
276 let start_col = col;
277 let start_off = offset!();
278 let byte_start = chars[i].0;
279 while i < n && (chars[i].1.is_ascii_digit() || chars[i].1 == '.') {
280 col += 1;
281 i += 1;
282 }
283 let byte_end = if i < n { chars[i].0 } else { line.len() };
284 let kind = if in_ignore {
285 TokenKind::Ignore
286 } else {
287 TokenKind::Literal
288 };
289 tokens.push(make_token(
290 kind,
291 &line[byte_start..byte_end],
292 line_num,
293 start_col,
294 start_off,
295 ));
296 continue;
297 }
298
299 if ch.is_alphabetic() || ch == '_' {
301 let start_col = col;
302 let start_off = offset!();
303 let byte_start = chars[i].0;
304 while i < n && (chars[i].1.is_alphanumeric() || chars[i].1 == '_') {
305 col += chars[i].1.len_utf8() as u32;
306 i += 1;
307 }
308 let byte_end = if i < n { chars[i].0 } else { line.len() };
309 let s = &line[byte_start..byte_end];
310 let kind = if in_ignore {
311 TokenKind::Ignore
312 } else {
313 classify_word(s)
314 };
315 tokens.push(make_token(kind, s, line_num, start_col, start_off));
316 continue;
317 }
318
319 let start_col = col;
321 let start_off = offset!();
322 let byte_start = chars[i].0;
323 col += ch.len_utf8() as u32;
324 i += 1;
325 let byte_end = if i < n { chars[i].0 } else { line.len() };
326 let kind = if in_ignore {
327 TokenKind::Ignore
328 } else {
329 TokenKind::Punctuation
330 };
331 tokens.push(make_token(
332 kind,
333 &line[byte_start..byte_end],
334 line_num,
335 start_col,
336 start_off,
337 ));
338 }
339
340 tokens
341}
342
343pub fn tokenize_generic(source: &str, format: &str) -> Vec<Token> {
345 if source.is_empty() {
346 return Vec::new();
347 }
348
349 let style = comment_style(format);
350 let mut tokens = Vec::new();
351 let mut in_ignore = false;
352 let mut in_block_comment = false;
353 let mut offset = 0u32;
354
355 for (line_idx, line) in source.lines().enumerate() {
356 let line_num = line_idx as u32 + 1;
357 let trimmed = line.trim();
358
359 if is_ignore_start(trimmed) {
360 in_ignore = true;
361 }
362 if is_ignore_end(trimmed) {
363 in_ignore = false;
364 offset += line.len() as u32 + 1;
366 continue;
367 }
368
369 let line_tokens = tokenize_line_content(
370 line,
371 line_num,
372 offset,
373 style,
374 in_ignore,
375 &mut in_block_comment,
376 );
377 tokens.extend(line_tokens);
378 offset += line.len() as u32 + 1;
379 }
380
381 tokens
382}
383
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `source` as `format` and reports whether any token has `kind`.
    fn emits_kind(source: &str, format: &str, kind: TokenKind) -> bool {
        tokenize_generic(source, format)
            .iter()
            .any(|token| token.kind == kind)
    }

    #[test]
    fn python_produces_tokens() {
        let produced = tokenize_generic("def hello():\n return 42\n", "python");
        assert!(!produced.is_empty());
    }

    #[test]
    fn python_hash_comment_marked_as_comment() {
        assert!(
            emits_kind("# this is a comment\nx = 1\n", "python", TokenKind::Comment),
            "Python # comments must be Comment kind"
        );
    }

    #[test]
    fn go_c_style_comment_recognized() {
        assert!(emits_kind(
            "// hello\nfunc main() {}\n",
            "go",
            TokenKind::Comment
        ));
    }

    #[test]
    fn empty_input_returns_empty() {
        let produced = tokenize_generic("", "python");
        assert!(
            produced.is_empty(),
            "empty input must return empty vec, not panic"
        );
    }

    #[test]
    fn unknown_format_does_not_panic() {
        let outcome =
            std::panic::catch_unwind(|| tokenize_generic("hello world", "unknown_format_xyz"));
        assert!(outcome.is_ok());
    }

    #[test]
    fn ignore_region_tokens_marked_as_ignore() {
        let source = "x = 1\n# jscpd:ignore-start\ny = 2\n# jscpd:ignore-end\nz = 3\n";
        assert!(
            emits_kind(source, "python", TokenKind::Ignore),
            "tokens in ignore region must be Ignore kind"
        );
    }

    #[test]
    fn sql_double_dash_comment_recognized() {
        assert!(emits_kind(
            "-- a comment\nSELECT * FROM foo;\n",
            "sql",
            TokenKind::Comment
        ));
    }

    #[test]
    fn c_block_comment_recognized() {
        assert!(emits_kind(
            "/* block */\nint x = 1;\n",
            "c",
            TokenKind::Comment
        ));
    }

    #[test]
    fn location_line_numbers_are_1_based() {
        let produced = tokenize_generic("x = 1\ny = 2\n", "python");
        let first = produced.first().expect("at least one token");
        assert_eq!(first.start.line, 1);
    }
}