1use std::str::FromStr;
2
3use cpd_core::hash::hash_token;
4use cpd_core::models::{DetectionToken, Token, TokenKind};
5
6#[derive(Debug, Clone)]
15pub struct TokenMap {
16 pub format: String,
17 pub tokens: Vec<DetectionToken>,
18}
19
/// Tokenization mode; selects which token kinds are filtered out.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum Mode {
    /// Default mode: whitespace tokens are dropped.
    #[default]
    Mild,
    /// Whitespace, comment and block-comment tokens are dropped.
    Weak,
    /// No mode-based filtering is applied.
    Strict,
}
27
28impl FromStr for Mode {
29 type Err = ();
30
31 fn from_str(s: &str) -> Result<Self, Self::Err> {
32 match s {
33 "weak" => Ok(Self::Weak),
34 "strict" => Ok(Self::Strict),
35 _ => Ok(Self::Mild),
36 }
37 }
38}
39
40#[derive(Debug, Clone)]
45pub struct TokenizeOptions {
46 pub mode: Mode,
47 pub ignore_case: bool,
49 pub ignore_ranges: Vec<[usize; 2]>,
52}
53
54impl TokenizeOptions {
55 pub fn new(mode: Mode) -> Self {
56 Self {
57 mode,
58 ignore_case: false,
59 ignore_ranges: Vec::new(),
60 }
61 }
62}
63
64#[allow(clippy::too_many_arguments)]
74#[inline]
75pub fn push_token(
76 tokens: &mut Vec<DetectionToken>,
77 kind: TokenKind,
78 value: &str,
79 byte_start: usize,
80 byte_end: usize,
81 start: cpd_core::models::Location,
82 end: cpd_core::models::Location,
83 options: &TokenizeOptions,
84) {
85 if kind == TokenKind::Ignore {
87 return;
88 }
89 if options
91 .ignore_ranges
92 .iter()
93 .any(|[rs, re]| byte_start < *re && byte_end > *rs)
94 {
95 return;
96 }
97 match options.mode {
99 Mode::Mild => {
100 if kind == TokenKind::Whitespace {
101 return;
102 }
103 }
104 Mode::Weak => {
105 if matches!(
106 kind,
107 TokenKind::Whitespace | TokenKind::Comment | TokenKind::BlockComment
108 ) {
109 return;
110 }
111 }
112 Mode::Strict => {} }
114 tokens.push(DetectionToken {
115 hash: hash_token(kind.discriminant(), value, options.ignore_case),
116 start,
117 end,
118 range: [byte_start, byte_end],
119 });
120}
121
122pub fn tokenize(format: &str, source: &str, mode: Mode) -> Vec<Token> {
128 let raw = dispatch_tokenizer(format, source, mode);
129 raw.into_iter().filter(|t| keep_token(t, mode)).collect()
132}
133
134fn keep_token(token: &Token, mode: Mode) -> bool {
135 if token.kind == TokenKind::Ignore {
136 return false;
137 }
138 match mode {
139 Mode::Mild => !matches!(token.kind, TokenKind::Whitespace),
140 Mode::Weak => !matches!(
141 token.kind,
142 TokenKind::Whitespace | TokenKind::Comment | TokenKind::BlockComment
143 ),
144 Mode::Strict => true,
145 }
146}
147
148pub fn tokenize_to_detection(
158 format: &str,
159 source: &str,
160 options: &TokenizeOptions,
161) -> Vec<DetectionToken> {
162 let raw = dispatch_tokenizer(format, source, options.mode);
170 let mut detection = Vec::with_capacity(raw.len());
171 for t in raw {
172 let byte_start = t.start.offset as usize;
173 let byte_end = t.end.offset as usize;
174 push_token(
175 &mut detection,
176 t.kind,
177 &t.value,
178 byte_start,
179 byte_end,
180 t.start,
181 t.end,
182 options,
183 );
184 }
185 detection
186}
187
188fn dispatch_tokenizer(format: &str, source: &str, mode: Mode) -> Vec<Token> {
189 match format {
190 "javascript" | "typescript" | "jsx" | "tsx" => {
191 crate::javascript::tokenize_js(source, format)
192 }
193 "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc(source, format, mode),
194 "markdown" | "md" => crate::markdown::tokenize_markdown(source, mode),
195 _ => crate::generic::tokenize_generic(source, format),
196 }
197}
198
199pub fn tokenize_to_detection_maps(
208 format: &str,
209 source: &str,
210 options: &TokenizeOptions,
211) -> Vec<TokenMap> {
212 match format {
213 "markdown" | "md" => crate::markdown::tokenize_markdown_maps(source, options),
214 "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc_maps(source, format, options),
215 _ => {
216 let tokens = tokenize_to_detection(format, source, options);
217 vec![TokenMap {
218 format: format.to_string(),
219 tokens,
220 }]
221 }
222 }
223}
224
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the location (line 1, column 0, offset 0) used by the `push_token` tests.
    fn origin() -> cpd_core::models::Location {
        cpd_core::models::Location {
            line: 1,
            column: 0,
            offset: 0,
        }
    }

    /// Calls `push_token` once on an empty buffer for a token spanning bytes
    /// `0..value.len()` and returns whatever was pushed.
    fn push_single(
        kind: TokenKind,
        value: &str,
        options: &TokenizeOptions,
    ) -> Vec<DetectionToken> {
        let mut tokens = Vec::new();
        push_token(
            &mut tokens,
            kind,
            value,
            0,
            value.len(),
            origin(),
            origin(),
            options,
        );
        tokens
    }

    #[test]
    fn mode_from_str_defaults_to_mild() {
        assert_eq!("unknown".parse::<Mode>().unwrap(), Mode::Mild);
        assert_eq!("mild".parse::<Mode>().unwrap(), Mode::Mild);
    }

    #[test]
    fn mode_from_str_weak() {
        assert_eq!("weak".parse::<Mode>().unwrap(), Mode::Weak);
    }

    #[test]
    fn mode_from_str_strict() {
        assert_eq!("strict".parse::<Mode>().unwrap(), Mode::Strict);
    }

    #[test]
    fn tokenize_to_detection_returns_detection_tokens() {
        let opts = TokenizeOptions::new(Mode::Mild);
        let tokens = tokenize_to_detection("javascript", "function hello() { return 42; }", &opts);
        assert!(
            !tokens.is_empty(),
            "must produce DetectionTokens for valid JS"
        );
    }

    #[test]
    fn tokenize_to_detection_mild_excludes_whitespace() {
        let source = "a b c";
        let mild = tokenize_to_detection("javascript", source, &TokenizeOptions::new(Mode::Mild));
        let strict =
            tokenize_to_detection("javascript", source, &TokenizeOptions::new(Mode::Strict));
        // The JS tokenizer is not given the mode, so both runs start from the
        // same raw tokens; Mild only filters more than Strict. Whether the
        // tokenizer emits whitespace tokens at all is not assumed here.
        assert!(
            mild.len() <= strict.len(),
            "Mild must never keep more tokens than Strict"
        );
        assert!(
            mild.iter()
                .all(|m| strict.iter().any(|s| s.range == m.range)),
            "every Mild token must also be present in Strict output"
        );
    }

    #[test]
    fn push_token_drops_ignore_kind() {
        let opts = TokenizeOptions::new(Mode::Mild);
        let tokens = push_single(TokenKind::Ignore, "secret", &opts);
        assert!(tokens.is_empty(), "Ignore-kind tokens must be dropped");
    }

    #[test]
    fn push_token_drops_whitespace_in_mild_mode() {
        let opts = TokenizeOptions::new(Mode::Mild);
        let tokens = push_single(TokenKind::Whitespace, " ", &opts);
        assert!(tokens.is_empty(), "Whitespace must be dropped in Mild mode");
    }

    #[test]
    fn push_token_keeps_whitespace_in_strict_mode() {
        let opts = TokenizeOptions::new(Mode::Strict);
        let tokens = push_single(TokenKind::Whitespace, " ", &opts);
        assert_eq!(tokens.len(), 1, "Whitespace must be kept in Strict mode");
    }

    #[test]
    fn push_token_drops_comment_in_weak_mode() {
        let opts = TokenizeOptions::new(Mode::Weak);
        let tokens = push_single(TokenKind::Comment, "// note", &opts);
        assert!(tokens.is_empty(), "Comment must be dropped in Weak mode");
    }

    #[test]
    fn push_token_ignore_case_folds_hash() {
        let mut opts = TokenizeOptions::new(Mode::Mild);
        opts.ignore_case = true;
        let t1 = push_single(TokenKind::Identifier, "Hello", &opts);
        let t2 = push_single(TokenKind::Identifier, "hello", &opts);
        assert_eq!(t1[0].hash, t2[0].hash, "ignore_case must fold case in hash");
    }
}