1use xxhash_rust::xxh3::xxh3_64;
2
3use super::tokenize::{SourceToken, TokenKind};
4use fallow_config::{DetectionMode, NormalizationConfig, ResolvedNormalization};
5
6#[derive(Debug, Clone)]
8pub struct HashedToken {
9 pub hash: u64,
11 pub original_index: usize,
13}
14
15#[must_use]
20pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
21 let resolved = ResolvedNormalization::resolve(mode, &NormalizationConfig::default());
22 normalize_and_hash_resolved(tokens, resolved)
23}
24
25#[must_use]
29pub fn normalize_and_hash_resolved(
30 tokens: &[SourceToken],
31 normalization: ResolvedNormalization,
32) -> Vec<HashedToken> {
33 let mut result = Vec::with_capacity(tokens.len());
34 let mut namespace = "js";
35
36 for (i, token) in tokens.iter().enumerate() {
37 if let TokenKind::Boundary(name) = &token.kind {
38 namespace = name;
39 }
40 let hash = hash_token_resolved(&token.kind, normalization, namespace);
41 result.push(HashedToken {
42 hash,
43 original_index: i,
44 });
45 }
46
47 result
48}
49
50fn hash_token_resolved(kind: &TokenKind, norm: ResolvedNormalization, namespace: &str) -> u64 {
52 match kind {
53 TokenKind::Keyword(kw) => hash_namespaced(namespace, &[0, *kw as u8]),
54 TokenKind::Identifier(name) => {
55 if norm.ignore_identifiers {
56 hash_namespaced(namespace, &[1, 0])
57 } else {
58 let mut buf = vec![1];
59 buf.extend_from_slice(name.as_bytes());
60 hash_namespaced(namespace, &buf)
61 }
62 }
63 TokenKind::StringLiteral(val) => {
64 if norm.ignore_string_values {
65 hash_namespaced(namespace, &[2, 0])
66 } else {
67 let mut buf = vec![2];
68 buf.extend_from_slice(val.as_bytes());
69 hash_namespaced(namespace, &buf)
70 }
71 }
72 TokenKind::NumericLiteral(val) => {
73 if norm.ignore_numeric_values {
74 hash_namespaced(namespace, &[3, 0])
75 } else {
76 let mut buf = vec![3];
77 buf.extend_from_slice(val.as_bytes());
78 hash_namespaced(namespace, &buf)
79 }
80 }
81 TokenKind::BooleanLiteral(val) => hash_namespaced(namespace, &[4, u8::from(*val)]),
82 TokenKind::NullLiteral => hash_namespaced(namespace, &[5]),
83 TokenKind::TemplateLiteral => hash_namespaced(namespace, &[6]),
84 TokenKind::RegExpLiteral => hash_namespaced(namespace, &[7]),
85 TokenKind::Operator(op) => hash_namespaced(namespace, &[8, *op as u8]),
86 TokenKind::Punctuation(p) => hash_namespaced(namespace, &[9, *p as u8]),
87 TokenKind::Boundary(name) => {
88 let mut buf = vec![10];
89 buf.extend_from_slice(name.as_bytes());
90 hash_bytes(&buf)
91 }
92 }
93}
94
95fn hash_namespaced(namespace: &str, data: &[u8]) -> u64 {
96 let mut buf = Vec::with_capacity(namespace.len() + 1 + data.len());
97 buf.extend_from_slice(namespace.as_bytes());
98 buf.push(0);
99 buf.extend_from_slice(data);
100 hash_bytes(&buf)
101}
102
103fn hash_bytes(data: &[u8]) -> u64 {
105 xxh3_64(data)
106}
107
108#[cfg(test)]
109mod tests {
110 use super::*;
111 use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
112 use oxc_span::Span;
113
114 fn make_token(kind: TokenKind) -> SourceToken {
115 SourceToken {
116 kind,
117 span: Span::new(0, 0),
118 }
119 }
120
121 #[test]
122 fn strict_mode_preserves_identifiers() {
123 let tokens = vec![
124 make_token(TokenKind::Identifier("foo".to_string())),
125 make_token(TokenKind::Identifier("bar".to_string())),
126 ];
127
128 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
129 assert_eq!(hashed.len(), 2);
130 assert_ne!(hashed[0].hash, hashed[1].hash);
131 }
132
133 #[test]
134 fn semantic_mode_blinds_identifiers() {
135 let tokens = vec![
136 make_token(TokenKind::Identifier("foo".to_string())),
137 make_token(TokenKind::Identifier("bar".to_string())),
138 ];
139
140 let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
141 assert_eq!(hashed.len(), 2);
142 assert_eq!(hashed[0].hash, hashed[1].hash);
143 }
144
145 #[test]
146 fn semantic_mode_blinds_string_literals() {
147 let tokens = vec![
148 make_token(TokenKind::StringLiteral("hello".to_string())),
149 make_token(TokenKind::StringLiteral("world".to_string())),
150 ];
151
152 let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
153 assert_eq!(hashed.len(), 2);
154 assert_eq!(hashed[0].hash, hashed[1].hash);
155 }
156
157 #[test]
158 fn semantic_mode_blinds_numeric_literals() {
159 let tokens = vec![
160 make_token(TokenKind::NumericLiteral("42".to_string())),
161 make_token(TokenKind::NumericLiteral("99".to_string())),
162 ];
163
164 let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
165 assert_eq!(hashed.len(), 2);
166 assert_eq!(hashed[0].hash, hashed[1].hash);
167 }
168
169 #[test]
170 fn semantic_mode_preserves_booleans() {
171 let tokens = vec![
172 make_token(TokenKind::BooleanLiteral(true)),
173 make_token(TokenKind::BooleanLiteral(false)),
174 ];
175
176 let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
177 assert_eq!(hashed.len(), 2);
178 assert_ne!(hashed[0].hash, hashed[1].hash);
179 }
180
181 #[test]
182 fn semantic_mode_preserves_keywords() {
183 let tokens = vec![
184 make_token(TokenKind::Keyword(KeywordType::If)),
185 make_token(TokenKind::Keyword(KeywordType::While)),
186 ];
187
188 let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
189 assert_eq!(hashed.len(), 2);
190 assert_ne!(hashed[0].hash, hashed[1].hash);
191 }
192
193 #[test]
194 fn preserves_original_indices() {
195 let tokens = vec![
196 make_token(TokenKind::Keyword(KeywordType::Const)),
197 make_token(TokenKind::Identifier("x".to_string())),
198 make_token(TokenKind::Operator(OperatorType::Assign)),
199 ];
200
201 let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
202 assert_eq!(hashed.len(), 3);
203 assert_eq!(hashed[0].original_index, 0);
204 assert_eq!(hashed[1].original_index, 1);
205 assert_eq!(hashed[2].original_index, 2);
206 }
207
208 #[test]
209 fn empty_input_produces_empty_output() {
210 let tokens: Vec<SourceToken> = vec![];
211 let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
212 assert!(hashed.is_empty());
213 }
214
215 #[test]
216 fn operators_have_distinct_hashes() {
217 let tokens = vec![
218 make_token(TokenKind::Operator(OperatorType::Add)),
219 make_token(TokenKind::Operator(OperatorType::Sub)),
220 ];
221
222 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
223 assert_ne!(hashed[0].hash, hashed[1].hash);
224 }
225
226 #[test]
227 fn punctuation_has_distinct_hashes() {
228 let tokens = vec![
229 make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
230 make_token(TokenKind::Punctuation(PunctuationType::CloseParen)),
231 ];
232
233 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
234 assert_ne!(hashed[0].hash, hashed[1].hash);
235 }
236
237 #[test]
238 fn null_literal_has_stable_hash() {
239 let tokens = vec![make_token(TokenKind::NullLiteral)];
240 let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
241 let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
242 assert_eq!(h1[0].hash, h2[0].hash);
243 }
244
245 #[test]
246 fn template_literal_has_stable_hash() {
247 let tokens = vec![make_token(TokenKind::TemplateLiteral)];
248 let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
249 let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
250 assert_eq!(h1[0].hash, h2[0].hash);
251 }
252
253 #[test]
254 fn regexp_literal_has_stable_hash() {
255 let tokens = vec![make_token(TokenKind::RegExpLiteral)];
256 let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
257 let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
258 assert_eq!(h1[0].hash, h2[0].hash);
259 }
260
261 #[test]
262 fn null_template_regexp_have_distinct_hashes() {
263 let tokens = vec![
264 make_token(TokenKind::NullLiteral),
265 make_token(TokenKind::TemplateLiteral),
266 make_token(TokenKind::RegExpLiteral),
267 ];
268 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
269 assert_ne!(hashed[0].hash, hashed[1].hash);
270 assert_ne!(hashed[1].hash, hashed[2].hash);
271 assert_ne!(hashed[0].hash, hashed[2].hash);
272 }
273
274 #[test]
275 fn mild_mode_equivalent_to_strict() {
276 let id_tokens = vec![
277 make_token(TokenKind::Identifier("foo".to_string())),
278 make_token(TokenKind::Identifier("bar".to_string())),
279 ];
280 let hashed = normalize_and_hash(&id_tokens, DetectionMode::Mild);
281 assert_ne!(hashed[0].hash, hashed[1].hash);
282
283 let str_tokens = vec![
284 make_token(TokenKind::StringLiteral("hello".to_string())),
285 make_token(TokenKind::StringLiteral("world".to_string())),
286 ];
287 let hashed = normalize_and_hash(&str_tokens, DetectionMode::Mild);
288 assert_ne!(hashed[0].hash, hashed[1].hash);
289
290 let num_tokens = vec![
291 make_token(TokenKind::NumericLiteral("42".to_string())),
292 make_token(TokenKind::NumericLiteral("99".to_string())),
293 ];
294 let hashed = normalize_and_hash(&num_tokens, DetectionMode::Mild);
295 assert_ne!(hashed[0].hash, hashed[1].hash);
296 }
297
298 #[test]
299 fn weak_mode_blinds_strings_only() {
300 let id_tokens = vec![
301 make_token(TokenKind::Identifier("foo".to_string())),
302 make_token(TokenKind::Identifier("bar".to_string())),
303 ];
304 let hashed = normalize_and_hash(&id_tokens, DetectionMode::Weak);
305 assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves identifiers");
306
307 let num_tokens = vec![
308 make_token(TokenKind::NumericLiteral("42".to_string())),
309 make_token(TokenKind::NumericLiteral("99".to_string())),
310 ];
311 let hashed = normalize_and_hash(&num_tokens, DetectionMode::Weak);
312 assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves numbers");
313 }
314
315 #[test]
316 fn different_token_kinds_produce_distinct_hashes() {
317 let tokens = vec![
318 make_token(TokenKind::Keyword(KeywordType::Const)),
319 make_token(TokenKind::Identifier("x".to_string())),
320 make_token(TokenKind::StringLiteral("x".to_string())),
321 make_token(TokenKind::NumericLiteral("1".to_string())),
322 make_token(TokenKind::BooleanLiteral(true)),
323 make_token(TokenKind::NullLiteral),
324 make_token(TokenKind::TemplateLiteral),
325 make_token(TokenKind::RegExpLiteral),
326 make_token(TokenKind::Operator(OperatorType::Add)),
327 make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
328 ];
329 let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
330 for i in 0..hashed.len() {
331 for j in (i + 1)..hashed.len() {
332 assert_ne!(
333 hashed[i].hash, hashed[j].hash,
334 "Token at index {i} and {j} should have distinct hashes"
335 );
336 }
337 }
338 }
339
340 #[test]
341 fn resolved_strict_with_ignore_identifiers_override() {
342 let norm = ResolvedNormalization {
343 ignore_identifiers: true,
344 ignore_string_values: false,
345 ignore_numeric_values: false,
346 };
347 let tokens = vec![
348 make_token(TokenKind::Identifier("foo".to_string())),
349 make_token(TokenKind::Identifier("bar".to_string())),
350 ];
351
352 let hashed = normalize_and_hash_resolved(&tokens, norm);
353 assert_eq!(hashed.len(), 2);
354 assert_eq!(hashed[0].hash, hashed[1].hash);
355 }
356
357 #[test]
358 fn resolved_strict_with_ignore_strings_override() {
359 let norm = ResolvedNormalization {
360 ignore_identifiers: false,
361 ignore_string_values: true,
362 ignore_numeric_values: false,
363 };
364 let tokens = vec![
365 make_token(TokenKind::StringLiteral("hello".to_string())),
366 make_token(TokenKind::StringLiteral("world".to_string())),
367 ];
368
369 let hashed = normalize_and_hash_resolved(&tokens, norm);
370 assert_eq!(hashed[0].hash, hashed[1].hash);
371 }
372
373 #[test]
374 fn resolved_strict_with_ignore_numbers_override() {
375 let norm = ResolvedNormalization {
376 ignore_identifiers: false,
377 ignore_string_values: false,
378 ignore_numeric_values: true,
379 };
380 let tokens = vec![
381 make_token(TokenKind::NumericLiteral("42".to_string())),
382 make_token(TokenKind::NumericLiteral("99".to_string())),
383 ];
384
385 let hashed = normalize_and_hash_resolved(&tokens, norm);
386 assert_eq!(hashed[0].hash, hashed[1].hash);
387 }
388
389 #[test]
390 fn resolved_semantic_with_preserve_identifiers_override() {
391 let norm = ResolvedNormalization {
392 ignore_identifiers: false,
393 ignore_string_values: true,
394 ignore_numeric_values: true,
395 };
396 let tokens = vec![
397 make_token(TokenKind::Identifier("foo".to_string())),
398 make_token(TokenKind::Identifier("bar".to_string())),
399 ];
400
401 let hashed = normalize_and_hash_resolved(&tokens, norm);
402 assert_ne!(hashed[0].hash, hashed[1].hash);
403 }
404
405 #[test]
406 fn resolved_normalization_from_mode_defaults() {
407 use fallow_config::NormalizationConfig;
408
409 let norm =
410 ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
411 assert!(!norm.ignore_identifiers);
412 assert!(!norm.ignore_string_values);
413 assert!(!norm.ignore_numeric_values);
414
415 let norm =
416 ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
417 assert!(!norm.ignore_identifiers);
418 assert!(norm.ignore_string_values);
419 assert!(!norm.ignore_numeric_values);
420
421 let norm = ResolvedNormalization::resolve(
422 DetectionMode::Semantic,
423 &NormalizationConfig::default(),
424 );
425 assert!(norm.ignore_identifiers);
426 assert!(norm.ignore_string_values);
427 assert!(norm.ignore_numeric_values);
428 }
429
430 #[test]
431 fn resolved_normalization_overrides_mode_defaults() {
432 use fallow_config::NormalizationConfig;
433
434 let overrides = NormalizationConfig {
435 ignore_identifiers: Some(true),
436 ignore_string_values: None,
437 ignore_numeric_values: None,
438 };
439 let norm = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
440 assert!(norm.ignore_identifiers);
441 assert!(!norm.ignore_string_values);
442 assert!(!norm.ignore_numeric_values);
443 }
444
445 mod proptests {
446 use super::*;
447 use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
448 use oxc_span::Span;
449 use proptest::prelude::*;
450
451 fn make_token(kind: TokenKind) -> SourceToken {
452 SourceToken {
453 kind,
454 span: Span::new(0, 0),
455 }
456 }
457
458 fn arb_detection_mode() -> impl Strategy<Value = DetectionMode> {
459 prop::sample::select(vec![
460 DetectionMode::Strict,
461 DetectionMode::Mild,
462 DetectionMode::Weak,
463 DetectionMode::Semantic,
464 ])
465 }
466
467 fn arb_normalization() -> impl Strategy<Value = ResolvedNormalization> {
468 (any::<bool>(), any::<bool>(), any::<bool>()).prop_map(|(ids, strings, nums)| {
469 ResolvedNormalization {
470 ignore_identifiers: ids,
471 ignore_string_values: strings,
472 ignore_numeric_values: nums,
473 }
474 })
475 }
476
477 fn arb_token_kind() -> impl Strategy<Value = TokenKind> {
478 prop_oneof![
479 Just(TokenKind::Keyword(KeywordType::Const)),
480 Just(TokenKind::Keyword(KeywordType::If)),
481 Just(TokenKind::Keyword(KeywordType::Return)),
482 "[a-zA-Z_][a-zA-Z0-9_]{0,30}".prop_map(TokenKind::Identifier),
483 "[a-zA-Z0-9 _.,!?]{0,50}".prop_map(TokenKind::StringLiteral),
484 "[0-9]{1,10}(\\.[0-9]{1,5})?".prop_map(TokenKind::NumericLiteral),
485 any::<bool>().prop_map(TokenKind::BooleanLiteral),
486 Just(TokenKind::NullLiteral),
487 Just(TokenKind::TemplateLiteral),
488 Just(TokenKind::RegExpLiteral),
489 Just(TokenKind::Operator(OperatorType::Add)),
490 Just(TokenKind::Operator(OperatorType::Assign)),
491 Just(TokenKind::Punctuation(PunctuationType::OpenParen)),
492 Just(TokenKind::Punctuation(PunctuationType::CloseParen)),
493 ]
494 }
495
496 proptest! {
497 #[test]
499 fn normalization_is_idempotent(
500 kind in arb_token_kind(),
501 norm in arb_normalization(),
502 ) {
503 let token = make_token(kind);
504 let first = normalize_and_hash_resolved(std::slice::from_ref(&token), norm);
505 let second = normalize_and_hash_resolved(&[token], norm);
506 prop_assert_eq!(first.len(), second.len());
507 for (a, b) in first.iter().zip(second.iter()) {
508 prop_assert_eq!(a.hash, b.hash, "Normalization should be idempotent");
509 }
510 }
511
512 #[test]
514 fn normalization_is_deterministic(
515 kinds in prop::collection::vec(arb_token_kind(), 1..20),
516 mode in arb_detection_mode(),
517 ) {
518 let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
519 let result1 = normalize_and_hash(&tokens, mode);
520 let result2 = normalize_and_hash(&tokens, mode);
521 prop_assert_eq!(result1.len(), result2.len());
522 for (a, b) in result1.iter().zip(result2.iter()) {
523 prop_assert_eq!(a.hash, b.hash, "Same input must produce same hash");
524 prop_assert_eq!(a.original_index, b.original_index);
525 }
526 }
527
528 #[test]
530 fn output_length_matches_input(
531 kinds in prop::collection::vec(arb_token_kind(), 0..30),
532 mode in arb_detection_mode(),
533 ) {
534 let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
535 let result = normalize_and_hash(&tokens, mode);
536 prop_assert_eq!(
537 result.len(), tokens.len(),
538 "Output should have same length as input"
539 );
540 }
541
542 #[test]
544 fn original_indices_are_sequential(
545 kinds in prop::collection::vec(arb_token_kind(), 1..20),
546 norm in arb_normalization(),
547 ) {
548 let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
549 let result = normalize_and_hash_resolved(&tokens, norm);
550 for (i, hashed) in result.iter().enumerate() {
551 prop_assert_eq!(hashed.original_index, i);
552 }
553 }
554 }
555 }
556}