fallow_core/duplicates/normalize.rs

use xxhash_rust::xxh3::xxh3_64;

use super::tokenize::{SourceToken, TokenKind};
use fallow_config::{DetectionMode, NormalizationConfig, ResolvedNormalization};

/// A token with a precomputed hash for use in the detection engine.
#[derive(Debug, Clone)]
pub struct HashedToken {
    /// Hash of the normalized token.
    pub hash: u64,
    /// Index of this token in the original (pre-normalization) token sequence.
    pub original_index: usize,
}

/// Normalize and hash a token sequence according to the detection mode.
///
/// Returns a vector of `HashedToken` values ready for the Rabin-Karp sliding window.
/// Every input token yields exactly one `HashedToken`; the output has the same length
/// as the input, and `original_index` maps each entry back to its source token.
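///
/// # Example
///
/// A minimal illustration (dummy spans, semantic mode): identifiers are blinded,
/// so `foo` and `bar` receive the same hash.
///
/// ```ignore
/// let tokens = vec![
///     SourceToken { kind: TokenKind::Identifier("foo".to_string()), span: Span::new(0, 0) },
///     SourceToken { kind: TokenKind::Identifier("bar".to_string()), span: Span::new(0, 0) },
/// ];
/// let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
/// assert_eq!(hashed[0].hash, hashed[1].hash);
/// ```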
pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
    let resolved = ResolvedNormalization::resolve(mode, &NormalizationConfig::default());
    normalize_and_hash_resolved(tokens, resolved)
}

/// Normalize and hash with explicit resolved normalization flags.
///
/// This is the primary normalization entry point when using configurable overrides.
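///
/// # Example
///
/// A sketch of an override that blinds identifiers while keeping literal values
/// (`tokens` stands for any previously tokenized slice):
///
/// ```ignore
/// let norm = ResolvedNormalization {
///     ignore_identifiers: true,
///     ignore_string_values: false,
///     ignore_numeric_values: false,
/// };
/// let hashed = normalize_and_hash_resolved(&tokens, norm);
/// ```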
pub fn normalize_and_hash_resolved(
    tokens: &[SourceToken],
    normalization: ResolvedNormalization,
) -> Vec<HashedToken> {
    let mut result = Vec::with_capacity(tokens.len());

    for (i, token) in tokens.iter().enumerate() {
        let hash = hash_token_resolved(&token.kind, normalization);
        result.push(HashedToken {
            hash,
            original_index: i,
        });
    }

    result
}

/// Hash a single token using resolved normalization flags.
fn hash_token_resolved(kind: &TokenKind, norm: ResolvedNormalization) -> u64 {
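    // Each arm prefixes its payload with a distinct discriminant byte (0 for
    // keywords through 9 for punctuation), so tokens of different kinds never
    // collide even when their payloads match. When a value is blinded by a
    // normalization flag, the payload collapses to the fixed sentinel byte 0,
    // giving all blinded tokens of that kind the same hash.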
    match kind {
        TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
        TokenKind::Identifier(name) => {
            if norm.ignore_identifiers {
                hash_bytes(&[1, 0])
            } else {
                let mut buf = vec![1];
                buf.extend_from_slice(name.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::StringLiteral(val) => {
            if norm.ignore_string_values {
                hash_bytes(&[2, 0])
            } else {
                let mut buf = vec![2];
                buf.extend_from_slice(val.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::NumericLiteral(val) => {
            if norm.ignore_numeric_values {
                hash_bytes(&[3, 0])
            } else {
                let mut buf = vec![3];
                buf.extend_from_slice(val.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::BooleanLiteral(val) => hash_bytes(&[4, u8::from(*val)]),
        TokenKind::NullLiteral => hash_bytes(&[5]),
        TokenKind::TemplateLiteral => hash_bytes(&[6]),
        TokenKind::RegExpLiteral => hash_bytes(&[7]),
        TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
        TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
    }
}

/// Hash a byte slice using xxh3.
fn hash_bytes(data: &[u8]) -> u64 {
    xxh3_64(data)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
    use oxc_span::Span;

    fn make_token(kind: TokenKind) -> SourceToken {
        SourceToken {
            kind,
            span: Span::new(0, 0),
        }
    }

    #[test]
    fn strict_mode_preserves_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_eq!(hashed.len(), 2);
        // Different identifiers should have different hashes in strict mode
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        // Different identifiers should have the SAME hash in semantic mode
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_string_literals() {
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_numeric_literals() {
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_booleans() {
        let tokens = vec![
            make_token(TokenKind::BooleanLiteral(true)),
            make_token(TokenKind::BooleanLiteral(false)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_keywords() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::If)),
            make_token(TokenKind::Keyword(KeywordType::While)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn preserves_original_indices() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("x".to_string())),
            make_token(TokenKind::Operator(OperatorType::Assign)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert_eq!(hashed.len(), 3);
        assert_eq!(hashed[0].original_index, 0);
        assert_eq!(hashed[1].original_index, 1);
        assert_eq!(hashed[2].original_index, 2);
    }

    #[test]
    fn empty_input_produces_empty_output() {
        let tokens: Vec<SourceToken> = vec![];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert!(hashed.is_empty());
    }

    #[test]
    fn operators_have_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Operator(OperatorType::Add)),
            make_token(TokenKind::Operator(OperatorType::Sub)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn punctuation_has_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
            make_token(TokenKind::Punctuation(PunctuationType::CloseParen)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    // ── Literal token type tests ─────────────────────────────────

    #[test]
    fn null_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::NullLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        // NullLiteral has no value to normalize, so hash should be same across modes
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn template_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::TemplateLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn regexp_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::RegExpLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn null_template_regexp_have_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::NullLiteral),
            make_token(TokenKind::TemplateLiteral),
            make_token(TokenKind::RegExpLiteral),
        ];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
        assert_ne!(hashed[1].hash, hashed[2].hash);
        assert_ne!(hashed[0].hash, hashed[2].hash);
    }

    #[test]
    fn mild_mode_equivalent_to_strict() {
        // Mild mode is equivalent to Strict for AST-based tokenization (both preserve all values)
        let id_tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];
        let hashed = normalize_and_hash(&id_tokens, DetectionMode::Mild);
        // Identifiers preserved in Mild mode
        assert_ne!(hashed[0].hash, hashed[1].hash);

        let str_tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];
        let hashed = normalize_and_hash(&str_tokens, DetectionMode::Mild);
        // Strings preserved in Mild mode (same as Strict)
        assert_ne!(hashed[0].hash, hashed[1].hash);

        let num_tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];
        let hashed = normalize_and_hash(&num_tokens, DetectionMode::Mild);
        // Numbers preserved in Mild mode
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn weak_mode_blinds_strings_only() {
        let id_tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];
        let hashed = normalize_and_hash(&id_tokens, DetectionMode::Weak);
        assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves identifiers");

        let num_tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];
        let hashed = normalize_and_hash(&num_tokens, DetectionMode::Weak);
        assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves numbers");

        let str_tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];
        let hashed = normalize_and_hash(&str_tokens, DetectionMode::Weak);
        assert_eq!(hashed[0].hash, hashed[1].hash, "Weak blinds strings");
    }

    #[test]
    fn different_token_kinds_produce_distinct_hashes() {
        // All distinct token kinds with same inner value where applicable
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("x".to_string())),
            make_token(TokenKind::StringLiteral("x".to_string())),
            make_token(TokenKind::NumericLiteral("1".to_string())),
            make_token(TokenKind::BooleanLiteral(true)),
            make_token(TokenKind::NullLiteral),
            make_token(TokenKind::TemplateLiteral),
            make_token(TokenKind::RegExpLiteral),
            make_token(TokenKind::Operator(OperatorType::Add)),
            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
        ];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        // Each pair should have distinct hashes (different kind discriminant byte)
        for i in 0..hashed.len() {
            for j in (i + 1)..hashed.len() {
                assert_ne!(
                    hashed[i].hash, hashed[j].hash,
                    "Tokens at indices {i} and {j} should have distinct hashes"
                );
            }
        }
    }

    // ── Configurable normalization tests ──────────────────────────

    #[test]
    fn resolved_strict_with_ignore_identifiers_override() {
        // Strict mode normally preserves identifiers, but override blinds them
        let norm = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed.len(), 2);
        // Identifiers should be blinded
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_strings_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_numbers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_semantic_with_preserve_identifiers_override() {
        // Semantic mode normally blinds identifiers, but override preserves them
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        // Identifiers should be preserved (different hashes)
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_normalization_from_mode_defaults() {
        use fallow_config::NormalizationConfig;

        // Strict mode defaults: preserve everything
        let norm =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(!norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        // Weak mode defaults: blind strings only
        let norm =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        // Semantic mode defaults: blind all
        let norm = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(norm.ignore_numeric_values);
    }

    #[test]
    fn resolved_normalization_overrides_mode_defaults() {
        use fallow_config::NormalizationConfig;

        // Strict mode with explicit override to blind identifiers
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None, // Use mode default (false)
            ignore_numeric_values: None,
        };
        let norm = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(norm.ignore_identifiers); // Overridden
        assert!(!norm.ignore_string_values); // Mode default
        assert!(!norm.ignore_numeric_values); // Mode default
    }

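    // Sketch only: the Rabin-Karp engine that consumes these hashes lives in a
    // separate module; the fold below is a hypothetical stand-in window combiner,
    // used purely to illustrate that identifier renames become invisible once the
    // per-token hashes are computed in semantic mode.
    #[test]
    fn window_hashes_match_for_identifier_renames_in_semantic_mode() {
        fn window_hashes(hashed: &[HashedToken], k: usize) -> Vec<u64> {
            hashed
                .windows(k)
                .map(|w| {
                    w.iter()
                        .fold(0u64, |acc, t| acc.wrapping_mul(1_000_003).wrapping_add(t.hash))
                })
                .collect()
        }

        let original = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("total".to_string())),
            make_token(TokenKind::Operator(OperatorType::Assign)),
            make_token(TokenKind::NumericLiteral("1".to_string())),
        ];
        let renamed = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("sum".to_string())),
            make_token(TokenKind::Operator(OperatorType::Assign)),
            make_token(TokenKind::NumericLiteral("2".to_string())),
        ];

        let hashed_original = normalize_and_hash(&original, DetectionMode::Semantic);
        let hashed_renamed = normalize_and_hash(&renamed, DetectionMode::Semantic);
        // Semantic mode blinds identifiers and numeric values, so every window agrees.
        assert_eq!(window_hashes(&hashed_original, 3), window_hashes(&hashed_renamed, 3));
    }
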
    mod proptests {
        use super::*;
        use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
        use oxc_span::Span;
        use proptest::prelude::*;

        fn make_token(kind: TokenKind) -> SourceToken {
            SourceToken {
                kind,
                span: Span::new(0, 0),
            }
        }

        fn arb_detection_mode() -> impl Strategy<Value = DetectionMode> {
            prop::sample::select(vec![
                DetectionMode::Strict,
                DetectionMode::Mild,
                DetectionMode::Weak,
                DetectionMode::Semantic,
            ])
        }

        fn arb_normalization() -> impl Strategy<Value = ResolvedNormalization> {
            (any::<bool>(), any::<bool>(), any::<bool>()).prop_map(|(ids, strings, nums)| {
                ResolvedNormalization {
                    ignore_identifiers: ids,
                    ignore_string_values: strings,
                    ignore_numeric_values: nums,
                }
            })
        }

        fn arb_token_kind() -> impl Strategy<Value = TokenKind> {
            prop_oneof![
                Just(TokenKind::Keyword(KeywordType::Const)),
                Just(TokenKind::Keyword(KeywordType::If)),
                Just(TokenKind::Keyword(KeywordType::Return)),
                "[a-zA-Z_][a-zA-Z0-9_]{0,30}".prop_map(TokenKind::Identifier),
                "[a-zA-Z0-9 _.,!?]{0,50}".prop_map(TokenKind::StringLiteral),
                "[0-9]{1,10}(\\.[0-9]{1,5})?".prop_map(TokenKind::NumericLiteral),
                any::<bool>().prop_map(TokenKind::BooleanLiteral),
                Just(TokenKind::NullLiteral),
                Just(TokenKind::TemplateLiteral),
                Just(TokenKind::RegExpLiteral),
                Just(TokenKind::Operator(OperatorType::Add)),
                Just(TokenKind::Operator(OperatorType::Assign)),
                Just(TokenKind::Punctuation(PunctuationType::OpenParen)),
                Just(TokenKind::Punctuation(PunctuationType::CloseParen)),
            ]
        }

        proptest! {
            /// Normalizing a token twice produces the same result as normalizing once (idempotency).
            #[test]
            fn normalization_is_idempotent(
                kind in arb_token_kind(),
                norm in arb_normalization(),
            ) {
                let token = make_token(kind);
                let first = normalize_and_hash_resolved(std::slice::from_ref(&token), norm);
                // The hash is computed directly from the token kind + normalization flags.
                // Running it again on the same input must yield the same hash.
                let second = normalize_and_hash_resolved(&[token], norm);
                prop_assert_eq!(first.len(), second.len());
                for (a, b) in first.iter().zip(second.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Normalization should be idempotent");
                }
            }

            /// Same input always produces the same output (determinism).
            #[test]
            fn normalization_is_deterministic(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result1 = normalize_and_hash(&tokens, mode);
                let result2 = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(result1.len(), result2.len());
                for (a, b) in result1.iter().zip(result2.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Same input must produce same hash");
                    prop_assert_eq!(a.original_index, b.original_index);
                }
            }

            /// Output length always equals input length (no tokens are filtered).
            #[test]
            fn output_length_matches_input(
                kinds in prop::collection::vec(arb_token_kind(), 0..30),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(
                    result.len(), tokens.len(),
                    "Output should have same length as input"
                );
            }

            /// Original indices should be sequential 0..n.
            #[test]
            fn original_indices_are_sequential(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                norm in arb_normalization(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash_resolved(&tokens, norm);
                for (i, hashed) in result.iter().enumerate() {
                    prop_assert_eq!(hashed.original_index, i);
                }
            }
        }
    }
}