//! Token normalization and hashing for duplicate-code detection.
//!
//! Each `SourceToken` is reduced to a 64-bit XXH3 hash; depending on the
//! resolved normalization settings, identifier names and string/numeric
//! literal values may be blinded so that tokens differing only in those
//! values produce the same hash.

use xxhash_rust::xxh3::xxh3_64;

use super::tokenize::{SourceToken, TokenKind};
use fallow_config::{DetectionMode, NormalizationConfig, ResolvedNormalization};

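/// A source token reduced to a stable 64-bit hash, paired with its position in
/// the original token stream.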
#[derive(Debug, Clone)]
pub struct HashedToken {
    /// XXH3 hash of the normalized token.
    pub hash: u64,
    /// Index of this token in the slice passed to normalization.
    pub original_index: usize,
}

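/// Normalizes and hashes `tokens` using the default normalization settings
/// derived from the given detection `mode`.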
pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
    let resolved = ResolvedNormalization::resolve(mode, &NormalizationConfig::default());
    normalize_and_hash_resolved(tokens, resolved)
}

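/// Normalizes and hashes `tokens` with an already-resolved normalization,
/// recording each token's index in the input slice.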
pub fn normalize_and_hash_resolved(
    tokens: &[SourceToken],
    normalization: ResolvedNormalization,
) -> Vec<HashedToken> {
    let mut result = Vec::with_capacity(tokens.len());

    for (i, token) in tokens.iter().enumerate() {
        let hash = hash_token_resolved(&token.kind, normalization);
        result.push(HashedToken {
            hash,
            original_index: i,
        });
    }

    result
}

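/// Hashes a single token kind. Every hash input starts with a one-byte tag
/// identifying the kind; identifiers and string/numeric literals hash either
/// their text or, when blinded by the normalization settings, a fixed
/// placeholder byte.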
fn hash_token_resolved(kind: &TokenKind, norm: ResolvedNormalization) -> u64 {
    match kind {
        TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
        TokenKind::Identifier(name) => {
            if norm.ignore_identifiers {
                hash_bytes(&[1, 0])
            } else {
                let mut buf = vec![1];
                buf.extend_from_slice(name.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::StringLiteral(val) => {
            if norm.ignore_string_values {
                hash_bytes(&[2, 0])
            } else {
                let mut buf = vec![2];
                buf.extend_from_slice(val.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::NumericLiteral(val) => {
            if norm.ignore_numeric_values {
                hash_bytes(&[3, 0])
            } else {
                let mut buf = vec![3];
                buf.extend_from_slice(val.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::BooleanLiteral(val) => hash_bytes(&[4, u8::from(*val)]),
        TokenKind::NullLiteral => hash_bytes(&[5]),
        TokenKind::TemplateLiteral => hash_bytes(&[6]),
        TokenKind::RegExpLiteral => hash_bytes(&[7]),
        TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
        TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
    }
}

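/// Hashes a tagged byte buffer with 64-bit XXH3.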
fn hash_bytes(data: &[u8]) -> u64 {
    xxh3_64(data)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
    use oxc_span::Span;

    fn make_token(kind: TokenKind) -> SourceToken {
        SourceToken {
            kind,
            span: Span::new(0, 0),
        }
    }

    #[test]
    fn strict_mode_preserves_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_string_literals() {
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_numeric_literals() {
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_booleans() {
        let tokens = vec![
            make_token(TokenKind::BooleanLiteral(true)),
            make_token(TokenKind::BooleanLiteral(false)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_keywords() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::If)),
            make_token(TokenKind::Keyword(KeywordType::While)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn preserves_original_indices() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("x".to_string())),
            make_token(TokenKind::Operator(OperatorType::Assign)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert_eq!(hashed.len(), 3);
        assert_eq!(hashed[0].original_index, 0);
        assert_eq!(hashed[1].original_index, 1);
        assert_eq!(hashed[2].original_index, 2);
    }

    #[test]
    fn empty_input_produces_empty_output() {
        let tokens: Vec<SourceToken> = vec![];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert!(hashed.is_empty());
    }

    #[test]
    fn operators_have_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Operator(OperatorType::Add)),
            make_token(TokenKind::Operator(OperatorType::Sub)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn punctuation_has_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
            make_token(TokenKind::Punctuation(PunctuationType::CloseParen)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn null_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::NullLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn template_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::TemplateLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn regexp_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::RegExpLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn null_template_regexp_have_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::NullLiteral),
            make_token(TokenKind::TemplateLiteral),
            make_token(TokenKind::RegExpLiteral),
        ];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
        assert_ne!(hashed[1].hash, hashed[2].hash);
        assert_ne!(hashed[0].hash, hashed[2].hash);
    }

    #[test]
    fn mild_mode_equivalent_to_strict() {
        let id_tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];
        let hashed = normalize_and_hash(&id_tokens, DetectionMode::Mild);
        assert_ne!(hashed[0].hash, hashed[1].hash);

        let str_tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];
        let hashed = normalize_and_hash(&str_tokens, DetectionMode::Mild);
        assert_ne!(hashed[0].hash, hashed[1].hash);

        let num_tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];
        let hashed = normalize_and_hash(&num_tokens, DetectionMode::Mild);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn weak_mode_blinds_strings_only() {
        let id_tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];
        let hashed = normalize_and_hash(&id_tokens, DetectionMode::Weak);
        assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves identifiers");

        let str_tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];
        let hashed = normalize_and_hash(&str_tokens, DetectionMode::Weak);
        assert_eq!(hashed[0].hash, hashed[1].hash, "Weak blinds string values");

        let num_tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];
        let hashed = normalize_and_hash(&num_tokens, DetectionMode::Weak);
        assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves numbers");
    }

    #[test]
    fn different_token_kinds_produce_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("x".to_string())),
            make_token(TokenKind::StringLiteral("x".to_string())),
            make_token(TokenKind::NumericLiteral("1".to_string())),
            make_token(TokenKind::BooleanLiteral(true)),
            make_token(TokenKind::NullLiteral),
            make_token(TokenKind::TemplateLiteral),
            make_token(TokenKind::RegExpLiteral),
            make_token(TokenKind::Operator(OperatorType::Add)),
            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
        ];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        for i in 0..hashed.len() {
            for j in (i + 1)..hashed.len() {
                assert_ne!(
                    hashed[i].hash, hashed[j].hash,
                    "Tokens at indices {i} and {j} should have distinct hashes"
                );
            }
        }
    }

    #[test]
    fn resolved_strict_with_ignore_identifiers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_strings_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_numbers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_semantic_with_preserve_identifiers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_normalization_from_mode_defaults() {
        use fallow_config::NormalizationConfig;

        let norm =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(!norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        let norm =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        let norm = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(norm.ignore_numeric_values);
    }

    #[test]
    fn resolved_normalization_overrides_mode_defaults() {
        use fallow_config::NormalizationConfig;

        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: None,
        };
        let norm = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(norm.ignore_identifiers);
        assert!(!norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);
    }

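    // Property-based checks: normalization should be deterministic, idempotent,
    // length-preserving, and keep original indices sequential.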
    mod proptests {
        use super::*;
        use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
        use oxc_span::Span;
        use proptest::prelude::*;

        fn make_token(kind: TokenKind) -> SourceToken {
            SourceToken {
                kind,
                span: Span::new(0, 0),
            }
        }

        fn arb_detection_mode() -> impl Strategy<Value = DetectionMode> {
            prop::sample::select(vec![
                DetectionMode::Strict,
                DetectionMode::Mild,
                DetectionMode::Weak,
                DetectionMode::Semantic,
            ])
        }

        fn arb_normalization() -> impl Strategy<Value = ResolvedNormalization> {
            (any::<bool>(), any::<bool>(), any::<bool>()).prop_map(|(ids, strings, nums)| {
                ResolvedNormalization {
                    ignore_identifiers: ids,
                    ignore_string_values: strings,
                    ignore_numeric_values: nums,
                }
            })
        }

        fn arb_token_kind() -> impl Strategy<Value = TokenKind> {
            prop_oneof![
                Just(TokenKind::Keyword(KeywordType::Const)),
                Just(TokenKind::Keyword(KeywordType::If)),
                Just(TokenKind::Keyword(KeywordType::Return)),
                "[a-zA-Z_][a-zA-Z0-9_]{0,30}".prop_map(TokenKind::Identifier),
                "[a-zA-Z0-9 _.,!?]{0,50}".prop_map(TokenKind::StringLiteral),
                "[0-9]{1,10}(\\.[0-9]{1,5})?".prop_map(TokenKind::NumericLiteral),
                any::<bool>().prop_map(TokenKind::BooleanLiteral),
                Just(TokenKind::NullLiteral),
                Just(TokenKind::TemplateLiteral),
                Just(TokenKind::RegExpLiteral),
                Just(TokenKind::Operator(OperatorType::Add)),
                Just(TokenKind::Operator(OperatorType::Assign)),
                Just(TokenKind::Punctuation(PunctuationType::OpenParen)),
                Just(TokenKind::Punctuation(PunctuationType::CloseParen)),
            ]
        }

        proptest! {
            #[test]
            fn normalization_is_idempotent(
                kind in arb_token_kind(),
                norm in arb_normalization(),
            ) {
                let token = make_token(kind);
                let first = normalize_and_hash_resolved(std::slice::from_ref(&token), norm);
                let second = normalize_and_hash_resolved(&[token], norm);
                prop_assert_eq!(first.len(), second.len());
                for (a, b) in first.iter().zip(second.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Normalization should be idempotent");
                }
            }

            #[test]
            fn normalization_is_deterministic(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result1 = normalize_and_hash(&tokens, mode);
                let result2 = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(result1.len(), result2.len());
                for (a, b) in result1.iter().zip(result2.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Same input must produce same hash");
                    prop_assert_eq!(a.original_index, b.original_index);
                }
            }

            #[test]
            fn output_length_matches_input(
                kinds in prop::collection::vec(arb_token_kind(), 0..30),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(
                    result.len(), tokens.len(),
                    "Output should have same length as input"
                );
            }

            #[test]
            fn original_indices_are_sequential(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                norm in arb_normalization(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash_resolved(&tokens, norm);
                for (i, hashed) in result.iter().enumerate() {
                    prop_assert_eq!(hashed.original_index, i);
                }
            }
        }
    }
}