cpd_tokenizer/
javascript.rs1use std::panic::{AssertUnwindSafe, catch_unwind};
5use std::path::Path;
6
7use oxc_allocator::Allocator;
8use oxc_parser::{Kind, Parser, config::TokensParserConfig};
9use oxc_span::SourceType;
10
11use cpd_core::models::{Token, TokenKind};
12
13use crate::line_index::LineIndex;
14
15mod fallback {
18 use super::LineIndex;
19 use cpd_core::models::{Token, TokenKind};
20
21 fn find_ignore_ranges(source: &str) -> Vec<[usize; 2]> {
22 let mut ranges = Vec::new();
23 let mut start: Option<usize> = None;
24 let bytes = source.as_bytes();
25 let mut i = 0;
26 while i < bytes.len() {
27 if i + 1 < bytes.len() && bytes[i] == b'/' {
28 let end = if bytes[i + 1] == b'/' {
29 bytes[i..]
30 .iter()
31 .position(|&b| b == b'\n')
32 .map(|p| i + p)
33 .unwrap_or(bytes.len())
34 } else if bytes[i + 1] == b'*' {
35 bytes[i..]
36 .windows(2)
37 .position(|w| w == b"*/")
38 .map(|p| i + p + 2)
39 .unwrap_or(bytes.len())
40 } else {
41 i += 1;
42 continue;
43 };
44 let comment_text = &source[i..end];
45 if comment_text.contains("jscpd:ignore-start") {
46 start = Some(end);
47 } else if comment_text.contains("jscpd:ignore-end") {
48 if let Some(s) = start.take() {
49 ranges.push([s, i]);
50 }
51 }
52 i = end;
53 continue;
54 }
55 i += 1;
56 }
57 ranges
58 }
59
60 fn in_ignore(offset: usize, end: usize, ranges: &[[usize; 2]]) -> bool {
61 ranges.iter().any(|[rs, re]| offset < *re && end > *rs)
62 }
63
64 pub fn tokenize(source: &str, _format: &str) -> Vec<Token> {
66 let ignore_ranges = find_ignore_ranges(source);
67 let bytes = source.as_bytes();
68 let line_index = LineIndex::new(bytes);
69 let mut tokens = Vec::new();
70 let mut i = 0;
71 while i < bytes.len() {
72 let ch = match source[i..].chars().next() {
73 Some(c) => c,
74 None => break,
75 };
76 if ch.is_whitespace() {
77 i += ch.len_utf8();
78 continue;
79 }
80 if ch.is_alphanumeric() || ch == '_' || ch == '$' {
81 let start = i;
82 while i < bytes.len() {
83 let c = source[i..].chars().next().unwrap_or('\0');
84 if c.is_alphanumeric() || c == '_' || c == '$' {
85 i += c.len_utf8();
86 } else {
87 break;
88 }
89 }
90 let kind = if in_ignore(start, i, &ignore_ranges) {
91 TokenKind::Ignore
92 } else {
93 TokenKind::Other
94 };
95 tokens.push(Token {
96 kind,
97 value: source[start..i].to_string(),
98 start: line_index.location(start),
99 end: line_index.location(i),
100 });
101 } else {
102 let start = i;
103 i += ch.len_utf8();
104 let kind = if in_ignore(start, i, &ignore_ranges) {
105 TokenKind::Ignore
106 } else {
107 TokenKind::Other
108 };
109 tokens.push(Token {
110 kind,
111 value: ch.to_string(),
112 start: line_index.location(start),
113 end: line_index.location(i),
114 });
115 }
116 }
117 tokens
118 }
119}
120
/// Finds the byte ranges enclosed by `jscpd:ignore-start` / `jscpd:ignore-end`
/// marker comments.
///
/// Each entry is `[start, end]`: `start` is the offset just past the comment that
/// holds the start marker, `end` is the offset where the comment holding the end
/// marker begins. Comments are detected lexically (`//` up to the next `\n`,
/// `/*` up to the next `*/`, or end of input when unterminated); string literals
/// are not taken into account. An end marker without a pending start is ignored.
fn find_ignore_ranges(source: &str) -> Vec<[usize; 2]> {
    let bytes = source.as_bytes();
    let mut ranges = Vec::new();
    let mut pending_start: Option<usize> = None;
    let mut i = 0;
    // A comment opener needs two bytes, so the last byte can never start one.
    while i + 1 < bytes.len() {
        if bytes[i] != b'/' {
            i += 1;
            continue;
        }
        let end = match bytes[i + 1] {
            b'/' => bytes[i..]
                .iter()
                .position(|&b| b == b'\n')
                .map_or(bytes.len(), |p| i + p),
            b'*' => {
                // Search for the terminator *after* the opener: otherwise `/*/`
                // would match its own `*` as the start of `*/` and be treated as
                // a complete comment.
                let body = i + 2;
                bytes[body..]
                    .windows(2)
                    .position(|w| w == b"*/")
                    .map_or(bytes.len(), |p| body + p + 2)
            }
            _ => {
                i += 1;
                continue;
            }
        };
        let comment_text = &source[i..end];
        if comment_text.contains("jscpd:ignore-start") {
            pending_start = Some(end);
        } else if comment_text.contains("jscpd:ignore-end") {
            if let Some(s) = pending_start.take() {
                ranges.push([s, i]);
            }
        }
        i = end;
    }
    ranges
}
161
/// Reports whether the half-open span `offset..end` overlaps at least one of the
/// `[start, end]` pairs in `ranges`.
fn in_ignore(offset: usize, end: usize, ranges: &[[usize; 2]]) -> bool {
    for &[range_start, range_end] in ranges {
        // Two spans overlap when each one begins before the other finishes.
        if range_start < end && offset < range_end {
            return true;
        }
    }
    false
}
165
166const fn map_kind(kind: Kind) -> TokenKind {
167 if matches!(kind, Kind::Ident) {
168 return TokenKind::Identifier;
169 }
170 if kind.is_any_keyword() {
171 return TokenKind::Keyword;
172 }
173 if kind.is_literal() {
174 return TokenKind::Literal;
175 }
176 if kind.is_assignment_operator() {
177 return TokenKind::Operator;
178 }
179 if kind.is_binary_operator()
180 || kind.is_logical_operator()
181 || kind.is_unary_operator()
182 || kind.is_update_operator()
183 {
184 return TokenKind::Operator;
185 }
186 match kind {
187 Kind::Arrow => TokenKind::Operator,
188 Kind::Semicolon
189 | Kind::Comma
190 | Kind::Dot
191 | Kind::Dot3
192 | Kind::Colon
193 | Kind::LParen
194 | Kind::RParen
195 | Kind::LCurly
196 | Kind::RCurly
197 | Kind::LBrack
198 | Kind::RBrack
199 | Kind::At => TokenKind::Punctuation,
200 Kind::QuestionDot => TokenKind::Punctuation,
201 _ => TokenKind::Other,
202 }
203}
204
205fn source_type_for_format(format: &str) -> SourceType {
206 let filename = match format {
207 "typescript" => "input.ts",
208 "tsx" => "input.tsx",
209 _ => "input.jsx", };
211 SourceType::from_path(Path::new(filename)).unwrap_or_default()
212}
213
214pub fn tokenize_js(source: &str, format: &str) -> Vec<Token> {
218 if source.is_empty() {
219 return Vec::new();
220 }
221
222 match catch_unwind(AssertUnwindSafe(|| parse_with_oxc(source, format))) {
223 Ok(Some(tokens)) => tokens,
224 Ok(None) => {
225 log::debug!("cpd-tokenizer: OXC parse errors in {format} source, using fallback");
226 fallback::tokenize(source, format)
227 }
228 Err(_) => {
229 log::debug!("cpd-tokenizer: OXC panicked on {format} source, using fallback");
230 fallback::tokenize(source, format)
231 }
232 }
233}
234
235fn parse_with_oxc(source: &str, format: &str) -> Option<Vec<Token>> {
236 let allocator = Allocator::new();
237 let source_type = source_type_for_format(format);
238
239 let parser_return = Parser::new(&allocator, source, source_type)
240 .with_config(TokensParserConfig)
241 .parse();
242
243 if !parser_return.errors.is_empty() {
244 return None;
245 }
246
247 let ignore_ranges = find_ignore_ranges(source);
248 let bytes = source.as_bytes();
249 let line_index = LineIndex::new(bytes);
251
252 let tokens = parser_return
253 .tokens
254 .into_iter()
255 .filter_map(|token| {
256 let start = (token.start() as usize).min(source.len());
257 let end = (token.end() as usize).min(source.len());
258 if start >= end {
259 return None;
260 }
261 let kind = token.kind();
262 if matches!(kind, Kind::Eof | Kind::Undetermined | Kind::Skip) {
263 return None;
264 }
265 let value = &source[start..end];
266 let token_kind = if in_ignore(start, end, &ignore_ranges) {
267 TokenKind::Ignore
268 } else {
269 map_kind(kind)
270 };
271
272 Some(Token {
273 kind: token_kind,
274 value: value.to_string(),
275 start: line_index.location(start),
276 end: line_index.location(end),
277 })
278 })
279 .collect::<Vec<Token>>();
280
281 Some(tokens)
282}
283
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain JavaScript goes through the tokenizer and yields at least one token.
    #[test]
    fn valid_js_produces_tokens() {
        let tokens = tokenize_js("function hello() { return 42; }", "javascript");
        assert!(!tokens.is_empty(), "valid JS must produce tokens");
    }

    /// TypeScript type annotations are accepted for the "typescript" format.
    #[test]
    fn typescript_produces_tokens() {
        let tokens = tokenize_js("const x: number = 5;", "typescript");
        assert!(!tokens.is_empty());
    }

    /// Unparseable input must be handled without a panic escaping `tokenize_js`.
    #[test]
    fn malformed_js_does_not_panic() {
        let result = std::panic::catch_unwind(|| tokenize_js("let x = {{{", "javascript"));
        assert!(result.is_ok(), "malformed JS must not panic");
    }

    /// Empty input short-circuits to an empty token list.
    #[test]
    fn empty_source_returns_empty() {
        let tokens = tokenize_js("", "javascript");
        assert!(tokens.is_empty(), "empty source must produce no tokens");
    }

    /// Code between the ignore markers is tagged with `TokenKind::Ignore`.
    #[test]
    fn ignore_region_tokens_marked_as_ignore() {
        let source = r#"
const a = 1;
// jscpd:ignore-start
const b = 2;
// jscpd:ignore-end
const c = 3;
"#;
        let tokens = tokenize_js(source, "javascript");
        let has_ignore = tokens
            .iter()
            .any(|t| t.kind == cpd_core::models::TokenKind::Ignore);
        assert!(has_ignore, "tokens in ignore region must be marked Ignore");
    }

    /// JSX markup is accepted for the "jsx" format.
    #[test]
    fn jsx_produces_tokens() {
        let tokens = tokenize_js("const el = <div>hello</div>;", "jsx");
        assert!(!tokens.is_empty());
    }

    /// Type annotations are accepted for the "tsx" format.
    #[test]
    fn tsx_with_type_annotation() {
        let tokens = tokenize_js("const fn = (x: React.FC): void => {};", "tsx");
        assert!(!tokens.is_empty());
    }

    /// Token locations report the correct 1-based line for multi-line input.
    #[test]
    fn multiline_location_uses_binary_search() {
        let source = "const a = 1;\nconst b = 2;\nconst c = 3;";
        let tokens = tokenize_js(source, "javascript");
        let b_token = tokens.iter().find(|t| t.value == "b");
        assert!(b_token.is_some(), "must find token b");
        assert_eq!(b_token.unwrap().start.line, 2, "b must be on line 2");
    }
}