1use xxhash_rust::xxh3::xxh3_64;
2
3use super::tokenize::{SourceToken, TokenKind};
4use fallow_config::{DetectionMode, NormalizationConfig, ResolvedNormalization};
5
6#[derive(Debug, Clone)]
8pub struct HashedToken {
9 pub hash: u64,
11 pub original_index: usize,
13}
14
15#[must_use]
20pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
21 let resolved = ResolvedNormalization::resolve(mode, &NormalizationConfig::default());
22 normalize_and_hash_resolved(tokens, resolved)
23}
24
25#[must_use]
29pub fn normalize_and_hash_resolved(
30 tokens: &[SourceToken],
31 normalization: ResolvedNormalization,
32) -> Vec<HashedToken> {
33 let mut result = Vec::with_capacity(tokens.len());
34
35 for (i, token) in tokens.iter().enumerate() {
36 let hash = hash_token_resolved(&token.kind, normalization);
37 result.push(HashedToken {
38 hash,
39 original_index: i,
40 });
41 }
42
43 result
44}
45
46fn hash_token_resolved(kind: &TokenKind, norm: ResolvedNormalization) -> u64 {
48 match kind {
49 TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
50 TokenKind::Identifier(name) => {
51 if norm.ignore_identifiers {
52 hash_bytes(&[1, 0])
53 } else {
54 let mut buf = vec![1];
55 buf.extend_from_slice(name.as_bytes());
56 hash_bytes(&buf)
57 }
58 }
59 TokenKind::StringLiteral(val) => {
60 if norm.ignore_string_values {
61 hash_bytes(&[2, 0])
62 } else {
63 let mut buf = vec![2];
64 buf.extend_from_slice(val.as_bytes());
65 hash_bytes(&buf)
66 }
67 }
68 TokenKind::NumericLiteral(val) => {
69 if norm.ignore_numeric_values {
70 hash_bytes(&[3, 0])
71 } else {
72 let mut buf = vec![3];
73 buf.extend_from_slice(val.as_bytes());
74 hash_bytes(&buf)
75 }
76 }
77 TokenKind::BooleanLiteral(val) => hash_bytes(&[4, u8::from(*val)]),
78 TokenKind::NullLiteral => hash_bytes(&[5]),
79 TokenKind::TemplateLiteral => hash_bytes(&[6]),
80 TokenKind::RegExpLiteral => hash_bytes(&[7]),
81 TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
82 TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
83 }
84}
85
86fn hash_bytes(data: &[u8]) -> u64 {
88 xxh3_64(data)
89}
90
91#[cfg(test)]
92mod tests {
93 use super::*;
94 use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
95 use oxc_span::Span;
96
97 fn make_token(kind: TokenKind) -> SourceToken {
98 SourceToken {
99 kind,
100 span: Span::new(0, 0),
101 }
102 }
103
104 #[test]
105 fn strict_mode_preserves_identifiers() {
106 let tokens = vec![
107 make_token(TokenKind::Identifier("foo".to_string())),
108 make_token(TokenKind::Identifier("bar".to_string())),
109 ];
110
111 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
112 assert_eq!(hashed.len(), 2);
113 assert_ne!(hashed[0].hash, hashed[1].hash);
114 }
115
116 #[test]
117 fn semantic_mode_blinds_identifiers() {
118 let tokens = vec![
119 make_token(TokenKind::Identifier("foo".to_string())),
120 make_token(TokenKind::Identifier("bar".to_string())),
121 ];
122
123 let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
124 assert_eq!(hashed.len(), 2);
125 assert_eq!(hashed[0].hash, hashed[1].hash);
126 }
127
128 #[test]
129 fn semantic_mode_blinds_string_literals() {
130 let tokens = vec![
131 make_token(TokenKind::StringLiteral("hello".to_string())),
132 make_token(TokenKind::StringLiteral("world".to_string())),
133 ];
134
135 let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
136 assert_eq!(hashed.len(), 2);
137 assert_eq!(hashed[0].hash, hashed[1].hash);
138 }
139
140 #[test]
141 fn semantic_mode_blinds_numeric_literals() {
142 let tokens = vec![
143 make_token(TokenKind::NumericLiteral("42".to_string())),
144 make_token(TokenKind::NumericLiteral("99".to_string())),
145 ];
146
147 let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
148 assert_eq!(hashed.len(), 2);
149 assert_eq!(hashed[0].hash, hashed[1].hash);
150 }
151
152 #[test]
153 fn semantic_mode_preserves_booleans() {
154 let tokens = vec![
155 make_token(TokenKind::BooleanLiteral(true)),
156 make_token(TokenKind::BooleanLiteral(false)),
157 ];
158
159 let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
160 assert_eq!(hashed.len(), 2);
161 assert_ne!(hashed[0].hash, hashed[1].hash);
162 }
163
164 #[test]
165 fn semantic_mode_preserves_keywords() {
166 let tokens = vec![
167 make_token(TokenKind::Keyword(KeywordType::If)),
168 make_token(TokenKind::Keyword(KeywordType::While)),
169 ];
170
171 let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
172 assert_eq!(hashed.len(), 2);
173 assert_ne!(hashed[0].hash, hashed[1].hash);
174 }
175
176 #[test]
177 fn preserves_original_indices() {
178 let tokens = vec![
179 make_token(TokenKind::Keyword(KeywordType::Const)),
180 make_token(TokenKind::Identifier("x".to_string())),
181 make_token(TokenKind::Operator(OperatorType::Assign)),
182 ];
183
184 let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
185 assert_eq!(hashed.len(), 3);
186 assert_eq!(hashed[0].original_index, 0);
187 assert_eq!(hashed[1].original_index, 1);
188 assert_eq!(hashed[2].original_index, 2);
189 }
190
191 #[test]
192 fn empty_input_produces_empty_output() {
193 let tokens: Vec<SourceToken> = vec![];
194 let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
195 assert!(hashed.is_empty());
196 }
197
198 #[test]
199 fn operators_have_distinct_hashes() {
200 let tokens = vec![
201 make_token(TokenKind::Operator(OperatorType::Add)),
202 make_token(TokenKind::Operator(OperatorType::Sub)),
203 ];
204
205 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
206 assert_ne!(hashed[0].hash, hashed[1].hash);
207 }
208
209 #[test]
210 fn punctuation_has_distinct_hashes() {
211 let tokens = vec![
212 make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
213 make_token(TokenKind::Punctuation(PunctuationType::CloseParen)),
214 ];
215
216 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
217 assert_ne!(hashed[0].hash, hashed[1].hash);
218 }
219
220 #[test]
221 fn null_literal_has_stable_hash() {
222 let tokens = vec![make_token(TokenKind::NullLiteral)];
223 let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
224 let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
225 assert_eq!(h1[0].hash, h2[0].hash);
226 }
227
228 #[test]
229 fn template_literal_has_stable_hash() {
230 let tokens = vec![make_token(TokenKind::TemplateLiteral)];
231 let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
232 let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
233 assert_eq!(h1[0].hash, h2[0].hash);
234 }
235
236 #[test]
237 fn regexp_literal_has_stable_hash() {
238 let tokens = vec![make_token(TokenKind::RegExpLiteral)];
239 let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
240 let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
241 assert_eq!(h1[0].hash, h2[0].hash);
242 }
243
244 #[test]
245 fn null_template_regexp_have_distinct_hashes() {
246 let tokens = vec![
247 make_token(TokenKind::NullLiteral),
248 make_token(TokenKind::TemplateLiteral),
249 make_token(TokenKind::RegExpLiteral),
250 ];
251 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
252 assert_ne!(hashed[0].hash, hashed[1].hash);
253 assert_ne!(hashed[1].hash, hashed[2].hash);
254 assert_ne!(hashed[0].hash, hashed[2].hash);
255 }
256
257 #[test]
258 fn mild_mode_equivalent_to_strict() {
259 let id_tokens = vec![
260 make_token(TokenKind::Identifier("foo".to_string())),
261 make_token(TokenKind::Identifier("bar".to_string())),
262 ];
263 let hashed = normalize_and_hash(&id_tokens, DetectionMode::Mild);
264 assert_ne!(hashed[0].hash, hashed[1].hash);
265
266 let str_tokens = vec![
267 make_token(TokenKind::StringLiteral("hello".to_string())),
268 make_token(TokenKind::StringLiteral("world".to_string())),
269 ];
270 let hashed = normalize_and_hash(&str_tokens, DetectionMode::Mild);
271 assert_ne!(hashed[0].hash, hashed[1].hash);
272
273 let num_tokens = vec![
274 make_token(TokenKind::NumericLiteral("42".to_string())),
275 make_token(TokenKind::NumericLiteral("99".to_string())),
276 ];
277 let hashed = normalize_and_hash(&num_tokens, DetectionMode::Mild);
278 assert_ne!(hashed[0].hash, hashed[1].hash);
279 }
280
281 #[test]
282 fn weak_mode_blinds_strings_only() {
283 let id_tokens = vec![
284 make_token(TokenKind::Identifier("foo".to_string())),
285 make_token(TokenKind::Identifier("bar".to_string())),
286 ];
287 let hashed = normalize_and_hash(&id_tokens, DetectionMode::Weak);
288 assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves identifiers");
289
290 let num_tokens = vec![
291 make_token(TokenKind::NumericLiteral("42".to_string())),
292 make_token(TokenKind::NumericLiteral("99".to_string())),
293 ];
294 let hashed = normalize_and_hash(&num_tokens, DetectionMode::Weak);
295 assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves numbers");
296 }
297
298 #[test]
299 fn different_token_kinds_produce_distinct_hashes() {
300 let tokens = vec![
301 make_token(TokenKind::Keyword(KeywordType::Const)),
302 make_token(TokenKind::Identifier("x".to_string())),
303 make_token(TokenKind::StringLiteral("x".to_string())),
304 make_token(TokenKind::NumericLiteral("1".to_string())),
305 make_token(TokenKind::BooleanLiteral(true)),
306 make_token(TokenKind::NullLiteral),
307 make_token(TokenKind::TemplateLiteral),
308 make_token(TokenKind::RegExpLiteral),
309 make_token(TokenKind::Operator(OperatorType::Add)),
310 make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
311 ];
312 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
313 for i in 0..hashed.len() {
314 for j in (i + 1)..hashed.len() {
315 assert_ne!(
316 hashed[i].hash, hashed[j].hash,
317 "Token at index {i} and {j} should have distinct hashes"
318 );
319 }
320 }
321 }
322
323 #[test]
324 fn resolved_strict_with_ignore_identifiers_override() {
325 let norm = ResolvedNormalization {
326 ignore_identifiers: true,
327 ignore_string_values: false,
328 ignore_numeric_values: false,
329 };
330 let tokens = vec![
331 make_token(TokenKind::Identifier("foo".to_string())),
332 make_token(TokenKind::Identifier("bar".to_string())),
333 ];
334
335 let hashed = normalize_and_hash_resolved(&tokens, norm);
336 assert_eq!(hashed.len(), 2);
337 assert_eq!(hashed[0].hash, hashed[1].hash);
338 }
339
340 #[test]
341 fn resolved_strict_with_ignore_strings_override() {
342 let norm = ResolvedNormalization {
343 ignore_identifiers: false,
344 ignore_string_values: true,
345 ignore_numeric_values: false,
346 };
347 let tokens = vec![
348 make_token(TokenKind::StringLiteral("hello".to_string())),
349 make_token(TokenKind::StringLiteral("world".to_string())),
350 ];
351
352 let hashed = normalize_and_hash_resolved(&tokens, norm);
353 assert_eq!(hashed[0].hash, hashed[1].hash);
354 }
355
356 #[test]
357 fn resolved_strict_with_ignore_numbers_override() {
358 let norm = ResolvedNormalization {
359 ignore_identifiers: false,
360 ignore_string_values: false,
361 ignore_numeric_values: true,
362 };
363 let tokens = vec![
364 make_token(TokenKind::NumericLiteral("42".to_string())),
365 make_token(TokenKind::NumericLiteral("99".to_string())),
366 ];
367
368 let hashed = normalize_and_hash_resolved(&tokens, norm);
369 assert_eq!(hashed[0].hash, hashed[1].hash);
370 }
371
372 #[test]
373 fn resolved_semantic_with_preserve_identifiers_override() {
374 let norm = ResolvedNormalization {
375 ignore_identifiers: false,
376 ignore_string_values: true,
377 ignore_numeric_values: true,
378 };
379 let tokens = vec![
380 make_token(TokenKind::Identifier("foo".to_string())),
381 make_token(TokenKind::Identifier("bar".to_string())),
382 ];
383
384 let hashed = normalize_and_hash_resolved(&tokens, norm);
385 assert_ne!(hashed[0].hash, hashed[1].hash);
386 }
387
388 #[test]
389 fn resolved_normalization_from_mode_defaults() {
390 use fallow_config::NormalizationConfig;
391
392 let norm =
393 ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
394 assert!(!norm.ignore_identifiers);
395 assert!(!norm.ignore_string_values);
396 assert!(!norm.ignore_numeric_values);
397
398 let norm =
399 ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
400 assert!(!norm.ignore_identifiers);
401 assert!(norm.ignore_string_values);
402 assert!(!norm.ignore_numeric_values);
403
404 let norm = ResolvedNormalization::resolve(
405 DetectionMode::Semantic,
406 &NormalizationConfig::default(),
407 );
408 assert!(norm.ignore_identifiers);
409 assert!(norm.ignore_string_values);
410 assert!(norm.ignore_numeric_values);
411 }
412
413 #[test]
414 fn resolved_normalization_overrides_mode_defaults() {
415 use fallow_config::NormalizationConfig;
416
417 let overrides = NormalizationConfig {
418 ignore_identifiers: Some(true),
419 ignore_string_values: None,
420 ignore_numeric_values: None,
421 };
422 let norm = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
423 assert!(norm.ignore_identifiers);
424 assert!(!norm.ignore_string_values);
425 assert!(!norm.ignore_numeric_values);
426 }
427
428 mod proptests {
429 use super::*;
430 use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
431 use oxc_span::Span;
432 use proptest::prelude::*;
433
434 fn make_token(kind: TokenKind) -> SourceToken {
435 SourceToken {
436 kind,
437 span: Span::new(0, 0),
438 }
439 }
440
441 fn arb_detection_mode() -> impl Strategy<Value = DetectionMode> {
442 prop::sample::select(vec![
443 DetectionMode::Strict,
444 DetectionMode::Mild,
445 DetectionMode::Weak,
446 DetectionMode::Semantic,
447 ])
448 }
449
450 fn arb_normalization() -> impl Strategy<Value = ResolvedNormalization> {
451 (any::<bool>(), any::<bool>(), any::<bool>()).prop_map(|(ids, strings, nums)| {
452 ResolvedNormalization {
453 ignore_identifiers: ids,
454 ignore_string_values: strings,
455 ignore_numeric_values: nums,
456 }
457 })
458 }
459
460 fn arb_token_kind() -> impl Strategy<Value = TokenKind> {
461 prop_oneof![
462 Just(TokenKind::Keyword(KeywordType::Const)),
463 Just(TokenKind::Keyword(KeywordType::If)),
464 Just(TokenKind::Keyword(KeywordType::Return)),
465 "[a-zA-Z_][a-zA-Z0-9_]{0,30}".prop_map(TokenKind::Identifier),
466 "[a-zA-Z0-9 _.,!?]{0,50}".prop_map(TokenKind::StringLiteral),
467 "[0-9]{1,10}(\\.[0-9]{1,5})?".prop_map(TokenKind::NumericLiteral),
468 any::<bool>().prop_map(TokenKind::BooleanLiteral),
469 Just(TokenKind::NullLiteral),
470 Just(TokenKind::TemplateLiteral),
471 Just(TokenKind::RegExpLiteral),
472 Just(TokenKind::Operator(OperatorType::Add)),
473 Just(TokenKind::Operator(OperatorType::Assign)),
474 Just(TokenKind::Punctuation(PunctuationType::OpenParen)),
475 Just(TokenKind::Punctuation(PunctuationType::CloseParen)),
476 ]
477 }
478
479 proptest! {
480 #[test]
482 fn normalization_is_idempotent(
483 kind in arb_token_kind(),
484 norm in arb_normalization(),
485 ) {
486 let token = make_token(kind);
487 let first = normalize_and_hash_resolved(std::slice::from_ref(&token), norm);
488 let second = normalize_and_hash_resolved(&[token], norm);
489 prop_assert_eq!(first.len(), second.len());
490 for (a, b) in first.iter().zip(second.iter()) {
491 prop_assert_eq!(a.hash, b.hash, "Normalization should be idempotent");
492 }
493 }
494
495 #[test]
497 fn normalization_is_deterministic(
498 kinds in prop::collection::vec(arb_token_kind(), 1..20),
499 mode in arb_detection_mode(),
500 ) {
501 let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
502 let result1 = normalize_and_hash(&tokens, mode);
503 let result2 = normalize_and_hash(&tokens, mode);
504 prop_assert_eq!(result1.len(), result2.len());
505 for (a, b) in result1.iter().zip(result2.iter()) {
506 prop_assert_eq!(a.hash, b.hash, "Same input must produce same hash");
507 prop_assert_eq!(a.original_index, b.original_index);
508 }
509 }
510
511 #[test]
513 fn output_length_matches_input(
514 kinds in prop::collection::vec(arb_token_kind(), 0..30),
515 mode in arb_detection_mode(),
516 ) {
517 let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
518 let result = normalize_and_hash(&tokens, mode);
519 prop_assert_eq!(
520 result.len(), tokens.len(),
521 "Output should have same length as input"
522 );
523 }
524
525 #[test]
527 fn original_indices_are_sequential(
528 kinds in prop::collection::vec(arb_token_kind(), 1..20),
529 norm in arb_normalization(),
530 ) {
531 let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
532 let result = normalize_and_hash_resolved(&tokens, norm);
533 for (i, hashed) in result.iter().enumerate() {
534 prop_assert_eq!(hashed.original_index, i);
535 }
536 }
537 }
538 }
539}