use xxhash_rust::xxh3::xxh3_64;

use super::tokenize::{SourceToken, TokenKind};
use fallow_config::{DetectionMode, NormalizationConfig, ResolvedNormalization};

#[derive(Debug, Clone)]
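/// A source token reduced to a normalized hash of its kind, paired with the
/// position it occupied in the original token stream.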
pub struct HashedToken {
    pub hash: u64,
    pub original_index: usize,
}

#[must_use]
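/// Normalizes and hashes `tokens` using the default normalization for `mode`.
///
/// Convenience wrapper around [`normalize_and_hash_resolved`] that resolves
/// `mode` against `NormalizationConfig::default()`.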
pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
    let resolved = ResolvedNormalization::resolve(mode, &NormalizationConfig::default());
    normalize_and_hash_resolved(tokens, resolved)
}

#[must_use]
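/// Normalizes and hashes `tokens` with an already-resolved normalization.
///
/// Output order matches input order; each `HashedToken` records the index of
/// the token it was derived from.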
pub fn normalize_and_hash_resolved(
    tokens: &[SourceToken],
    normalization: ResolvedNormalization,
) -> Vec<HashedToken> {
    let mut result = Vec::with_capacity(tokens.len());

    for (i, token) in tokens.iter().enumerate() {
        let hash = hash_token_resolved(&token.kind, normalization);
        result.push(HashedToken {
            hash,
            original_index: i,
        });
    }

    result
}

fn hash_token_resolved(kind: &TokenKind, norm: ResolvedNormalization) -> u64 {
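    // Each kind is prefixed with a distinct tag byte (0-9) so tokens of
    // different kinds can never hash to the same preimage. Blinded payloads
    // hash the fixed sentinel `[tag, 0]` instead of their value.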
    match kind {
        TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
        TokenKind::Identifier(name) => {
            if norm.ignore_identifiers {
                hash_bytes(&[1, 0])
            } else {
                let mut buf = vec![1];
                buf.extend_from_slice(name.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::StringLiteral(val) => {
            if norm.ignore_string_values {
                hash_bytes(&[2, 0])
            } else {
                let mut buf = vec![2];
                buf.extend_from_slice(val.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::NumericLiteral(val) => {
            if norm.ignore_numeric_values {
                hash_bytes(&[3, 0])
            } else {
                let mut buf = vec![3];
                buf.extend_from_slice(val.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::BooleanLiteral(val) => hash_bytes(&[4, u8::from(*val)]),
        TokenKind::NullLiteral => hash_bytes(&[5]),
        TokenKind::TemplateLiteral => hash_bytes(&[6]),
        TokenKind::RegExpLiteral => hash_bytes(&[7]),
        TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
        TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
    }
}

fn hash_bytes(data: &[u8]) -> u64 {
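    // xxh3 is a fast, deterministic, non-cryptographic hash; collisions are
    // possible in principle but acceptable for duplicate detection.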
    xxh3_64(data)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
    use oxc_span::Span;

    fn make_token(kind: TokenKind) -> SourceToken {
        SourceToken {
            kind,
            span: Span::new(0, 0),
        }
    }

    #[test]
    fn strict_mode_preserves_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_string_literals() {
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_numeric_literals() {
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_booleans() {
        let tokens = vec![
            make_token(TokenKind::BooleanLiteral(true)),
            make_token(TokenKind::BooleanLiteral(false)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_keywords() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::If)),
            make_token(TokenKind::Keyword(KeywordType::While)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn preserves_original_indices() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("x".to_string())),
            make_token(TokenKind::Operator(OperatorType::Assign)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert_eq!(hashed.len(), 3);
        assert_eq!(hashed[0].original_index, 0);
        assert_eq!(hashed[1].original_index, 1);
        assert_eq!(hashed[2].original_index, 2);
    }

    #[test]
    fn empty_input_produces_empty_output() {
        let tokens: Vec<SourceToken> = vec![];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert!(hashed.is_empty());
    }

    #[test]
    fn operators_have_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Operator(OperatorType::Add)),
            make_token(TokenKind::Operator(OperatorType::Sub)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn punctuation_has_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
            make_token(TokenKind::Punctuation(PunctuationType::CloseParen)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
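    // Payload-free tokens (null, template, regexp) carry no value that could
    // be blinded, so their hashes should be identical across modes.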
    fn null_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::NullLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn template_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::TemplateLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn regexp_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::RegExpLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn null_template_regexp_have_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::NullLiteral),
            make_token(TokenKind::TemplateLiteral),
            make_token(TokenKind::RegExpLiteral),
        ];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
        assert_ne!(hashed[1].hash, hashed[2].hash);
        assert_ne!(hashed[0].hash, hashed[2].hash);
    }

    #[test]
    fn mild_mode_equivalent_to_strict() {
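        // Mild should leave identifiers, strings, and numbers intact, so its
        // hashes behave exactly like Strict's.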
264 let id_tokens = vec![
266 make_token(TokenKind::Identifier("foo".to_string())),
267 make_token(TokenKind::Identifier("bar".to_string())),
268 ];
269 let hashed = normalize_and_hash(&id_tokens, DetectionMode::Mild);
270 assert_ne!(hashed[0].hash, hashed[1].hash);
272
273 let str_tokens = vec![
274 make_token(TokenKind::StringLiteral("hello".to_string())),
275 make_token(TokenKind::StringLiteral("world".to_string())),
276 ];
277 let hashed = normalize_and_hash(&str_tokens, DetectionMode::Mild);
278 assert_ne!(hashed[0].hash, hashed[1].hash);
280
281 let num_tokens = vec![
282 make_token(TokenKind::NumericLiteral("42".to_string())),
283 make_token(TokenKind::NumericLiteral("99".to_string())),
284 ];
285 let hashed = normalize_and_hash(&num_tokens, DetectionMode::Mild);
286 assert_ne!(hashed[0].hash, hashed[1].hash);
288 }
289
290 #[test]
291 fn weak_mode_blinds_strings_only() {
292 let id_tokens = vec![
293 make_token(TokenKind::Identifier("foo".to_string())),
294 make_token(TokenKind::Identifier("bar".to_string())),
295 ];
296 let hashed = normalize_and_hash(&id_tokens, DetectionMode::Weak);
297 assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves identifiers");
298
299 let num_tokens = vec![
300 make_token(TokenKind::NumericLiteral("42".to_string())),
301 make_token(TokenKind::NumericLiteral("99".to_string())),
302 ];
303 let hashed = normalize_and_hash(&num_tokens, DetectionMode::Weak);
304 assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves numbers");
305 }
306
307 #[test]
308 fn different_token_kinds_produce_distinct_hashes() {
309 let tokens = vec![
311 make_token(TokenKind::Keyword(KeywordType::Const)),
312 make_token(TokenKind::Identifier("x".to_string())),
313 make_token(TokenKind::StringLiteral("x".to_string())),
314 make_token(TokenKind::NumericLiteral("1".to_string())),
315 make_token(TokenKind::BooleanLiteral(true)),
316 make_token(TokenKind::NullLiteral),
317 make_token(TokenKind::TemplateLiteral),
318 make_token(TokenKind::RegExpLiteral),
319 make_token(TokenKind::Operator(OperatorType::Add)),
320 make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
321 ];
322 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
323 for i in 0..hashed.len() {
325 for j in (i + 1)..hashed.len() {
326 assert_ne!(
327 hashed[i].hash, hashed[j].hash,
328 "Token at index {i} and {j} should have distinct hashes"
329 );
330 }
331 }
332 }
333
334 #[test]
    fn resolved_strict_with_ignore_identifiers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_strings_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_numbers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_semantic_with_preserve_identifiers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_normalization_from_mode_defaults() {
        use fallow_config::NormalizationConfig;

        let norm =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(!norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        let norm =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        let norm = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(norm.ignore_numeric_values);
    }

    #[test]
    fn resolved_normalization_overrides_mode_defaults() {
        use fallow_config::NormalizationConfig;

        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: None,
        };
        let norm = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(norm.ignore_identifiers);
        assert!(!norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);
    }

    mod proptests {
        use super::*;
        use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
        use oxc_span::Span;
        use proptest::prelude::*;

        fn make_token(kind: TokenKind) -> SourceToken {
            SourceToken {
                kind,
                span: Span::new(0, 0),
            }
        }

        fn arb_detection_mode() -> impl Strategy<Value = DetectionMode> {
            prop::sample::select(vec![
                DetectionMode::Strict,
                DetectionMode::Mild,
                DetectionMode::Weak,
                DetectionMode::Semantic,
            ])
        }

        fn arb_normalization() -> impl Strategy<Value = ResolvedNormalization> {
            (any::<bool>(), any::<bool>(), any::<bool>()).prop_map(|(ids, strings, nums)| {
                ResolvedNormalization {
                    ignore_identifiers: ids,
                    ignore_string_values: strings,
                    ignore_numeric_values: nums,
                }
            })
        }

        fn arb_token_kind() -> impl Strategy<Value = TokenKind> {
            prop_oneof![
                Just(TokenKind::Keyword(KeywordType::Const)),
                Just(TokenKind::Keyword(KeywordType::If)),
                Just(TokenKind::Keyword(KeywordType::Return)),
                "[a-zA-Z_][a-zA-Z0-9_]{0,30}".prop_map(TokenKind::Identifier),
                "[a-zA-Z0-9 _.,!?]{0,50}".prop_map(TokenKind::StringLiteral),
                "[0-9]{1,10}(\\.[0-9]{1,5})?".prop_map(TokenKind::NumericLiteral),
                any::<bool>().prop_map(TokenKind::BooleanLiteral),
                Just(TokenKind::NullLiteral),
                Just(TokenKind::TemplateLiteral),
                Just(TokenKind::RegExpLiteral),
                Just(TokenKind::Operator(OperatorType::Add)),
                Just(TokenKind::Operator(OperatorType::Assign)),
                Just(TokenKind::Punctuation(PunctuationType::OpenParen)),
                Just(TokenKind::Punctuation(PunctuationType::CloseParen)),
            ]
        }

        proptest! {
            #[test]
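            // Re-hashing the same token with the same resolved normalization
            // must always produce the same result.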
            fn normalization_is_idempotent(
                kind in arb_token_kind(),
                norm in arb_normalization(),
            ) {
                let token = make_token(kind);
                let first = normalize_and_hash_resolved(std::slice::from_ref(&token), norm);
                let second = normalize_and_hash_resolved(&[token], norm);
                prop_assert_eq!(first.len(), second.len());
                for (a, b) in first.iter().zip(second.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Normalization should be idempotent");
                }
            }

            #[test]
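            // The same token sequence and mode must always yield the same
            // hashes and original indices.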
            fn normalization_is_deterministic(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result1 = normalize_and_hash(&tokens, mode);
                let result2 = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(result1.len(), result2.len());
                for (a, b) in result1.iter().zip(result2.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Same input must produce same hash");
                    prop_assert_eq!(a.original_index, b.original_index);
                }
            }

            #[test]
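            // Hashing never drops or invents tokens: one output per input.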
            fn output_length_matches_input(
                kinds in prop::collection::vec(arb_token_kind(), 0..30),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(
                    result.len(), tokens.len(),
                    "Output should have same length as input"
                );
            }

            #[test]
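            // original_index always mirrors the token's position in the input
            // slice, regardless of normalization.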
            fn original_indices_are_sequential(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                norm in arb_normalization(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash_resolved(&tokens, norm);
                for (i, hashed) in result.iter().enumerate() {
                    prop_assert_eq!(hashed.original_index, i);
                }
            }
        }
    }
}