fallow_core/duplicates/normalize.rs

use xxhash_rust::xxh3::xxh3_64;

use super::tokenize::{SourceToken, TokenKind};
use fallow_config::{DetectionMode, NormalizationConfig, ResolvedNormalization};

/// A token with a precomputed hash for use in the detection engine.
#[derive(Debug, Clone)]
pub struct HashedToken {
    /// Hash of the normalized token.
    pub hash: u64,
    /// Index of this token in the original (pre-normalization) token sequence.
    pub original_index: usize,
}

/// Normalize and hash a token sequence according to the detection mode.
///
/// Returns a vector of `HashedToken` values ready for the Rabin-Karp sliding window.
/// Every input token produces exactly one output entry; the detection mode only controls
/// which token values are blinded before hashing, not which tokens are kept.
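///
/// # Examples
///
/// A minimal sketch mirroring the unit tests below; it is marked `ignore` because it
/// assumes `SourceToken`, `TokenKind`, and `DetectionMode` are in scope exactly as they
/// are inside this crate.
///
/// ```ignore
/// use oxc_span::Span;
///
/// let tokens = vec![
///     SourceToken { kind: TokenKind::Identifier("foo".to_string()), span: Span::new(0, 0) },
///     SourceToken { kind: TokenKind::Identifier("bar".to_string()), span: Span::new(0, 0) },
/// ];
///
/// // Semantic mode blinds identifier names, so both tokens hash to the same value.
/// let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
/// assert_eq!(hashed.len(), 2);
/// assert_eq!(hashed[0].hash, hashed[1].hash);
/// ```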
#[must_use]
pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
    let resolved = ResolvedNormalization::resolve(mode, &NormalizationConfig::default());
    normalize_and_hash_resolved(tokens, resolved)
}

/// Normalize and hash with explicit resolved normalization flags.
///
/// This is the primary normalization entry point when using configurable overrides.
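///
/// # Examples
///
/// A minimal sketch of supplying the flags directly instead of deriving them from a
/// `DetectionMode`; marked `ignore` under the same in-scope assumptions as the example
/// on `normalize_and_hash`.
///
/// ```ignore
/// use oxc_span::Span;
///
/// // Blind string values only, keeping identifiers and numbers significant.
/// let norm = ResolvedNormalization {
///     ignore_identifiers: false,
///     ignore_string_values: true,
///     ignore_numeric_values: false,
/// };
///
/// let tokens = vec![
///     SourceToken { kind: TokenKind::StringLiteral("hello".to_string()), span: Span::new(0, 0) },
///     SourceToken { kind: TokenKind::StringLiteral("world".to_string()), span: Span::new(0, 0) },
/// ];
///
/// let hashed = normalize_and_hash_resolved(&tokens, norm);
/// assert_eq!(hashed[0].hash, hashed[1].hash);
/// ```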
#[must_use]
pub fn normalize_and_hash_resolved(
    tokens: &[SourceToken],
    normalization: ResolvedNormalization,
) -> Vec<HashedToken> {
    let mut result = Vec::with_capacity(tokens.len());

    for (i, token) in tokens.iter().enumerate() {
        let hash = hash_token_resolved(&token.kind, normalization);
        result.push(HashedToken {
            hash,
            original_index: i,
        });
    }

    result
}

/// Hash a single token using resolved normalization flags.
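///
/// Each token kind is written as a leading discriminant byte (0 for keywords, 1 for
/// identifiers, 2 for string literals, and so on) followed by the token's value bytes,
/// so the byte sequences for different kinds stay distinct even when their payloads
/// match. When a value class is blinded by the normalization flags, the payload is
/// replaced with a single `0` byte, making every member of that class hash identically.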
fn hash_token_resolved(kind: &TokenKind, norm: ResolvedNormalization) -> u64 {
    match kind {
        TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
        TokenKind::Identifier(name) => {
            if norm.ignore_identifiers {
                hash_bytes(&[1, 0])
            } else {
                let mut buf = vec![1];
                buf.extend_from_slice(name.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::StringLiteral(val) => {
            if norm.ignore_string_values {
                hash_bytes(&[2, 0])
            } else {
                let mut buf = vec![2];
                buf.extend_from_slice(val.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::NumericLiteral(val) => {
            if norm.ignore_numeric_values {
                hash_bytes(&[3, 0])
            } else {
                let mut buf = vec![3];
                buf.extend_from_slice(val.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::BooleanLiteral(val) => hash_bytes(&[4, u8::from(*val)]),
        TokenKind::NullLiteral => hash_bytes(&[5]),
        TokenKind::TemplateLiteral => hash_bytes(&[6]),
        TokenKind::RegExpLiteral => hash_bytes(&[7]),
        TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
        TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
    }
}

/// Hash a byte slice using xxh3.
fn hash_bytes(data: &[u8]) -> u64 {
    xxh3_64(data)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
    use oxc_span::Span;

    fn make_token(kind: TokenKind) -> SourceToken {
        SourceToken {
            kind,
            span: Span::new(0, 0),
        }
    }

    #[test]
    fn strict_mode_preserves_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_eq!(hashed.len(), 2);
        // Different identifiers should have different hashes in strict mode
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        // Different identifiers should have the SAME hash in semantic mode
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_string_literals() {
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_numeric_literals() {
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_booleans() {
        let tokens = vec![
            make_token(TokenKind::BooleanLiteral(true)),
            make_token(TokenKind::BooleanLiteral(false)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_keywords() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::If)),
            make_token(TokenKind::Keyword(KeywordType::While)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn preserves_original_indices() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("x".to_string())),
            make_token(TokenKind::Operator(OperatorType::Assign)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert_eq!(hashed.len(), 3);
        assert_eq!(hashed[0].original_index, 0);
        assert_eq!(hashed[1].original_index, 1);
        assert_eq!(hashed[2].original_index, 2);
    }

    #[test]
    fn empty_input_produces_empty_output() {
        let tokens: Vec<SourceToken> = vec![];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert!(hashed.is_empty());
    }

    #[test]
    fn operators_have_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Operator(OperatorType::Add)),
            make_token(TokenKind::Operator(OperatorType::Sub)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn punctuation_has_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
            make_token(TokenKind::Punctuation(PunctuationType::CloseParen)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    // ── Literal token type tests ─────────────────────────────────

    #[test]
    fn null_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::NullLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        // NullLiteral has no value to normalize, so hash should be same across modes
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn template_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::TemplateLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn regexp_literal_has_stable_hash() {
        let tokens = vec![make_token(TokenKind::RegExpLiteral)];
        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(h1[0].hash, h2[0].hash);
    }

    #[test]
    fn null_template_regexp_have_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::NullLiteral),
            make_token(TokenKind::TemplateLiteral),
            make_token(TokenKind::RegExpLiteral),
        ];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
        assert_ne!(hashed[1].hash, hashed[2].hash);
        assert_ne!(hashed[0].hash, hashed[2].hash);
    }

    #[test]
    fn mild_mode_equivalent_to_strict() {
        // Mild mode is equivalent to Strict for AST-based tokenization (both preserve all values)
        let id_tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];
        let hashed = normalize_and_hash(&id_tokens, DetectionMode::Mild);
        // Identifiers preserved in Mild mode
        assert_ne!(hashed[0].hash, hashed[1].hash);

        let str_tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];
        let hashed = normalize_and_hash(&str_tokens, DetectionMode::Mild);
        // Strings preserved in Mild mode (same as Strict)
        assert_ne!(hashed[0].hash, hashed[1].hash);

        let num_tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];
        let hashed = normalize_and_hash(&num_tokens, DetectionMode::Mild);
        // Numbers preserved in Mild mode
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn weak_mode_blinds_strings_only() {
        let id_tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];
        let hashed = normalize_and_hash(&id_tokens, DetectionMode::Weak);
        assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves identifiers");

        let num_tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];
        let hashed = normalize_and_hash(&num_tokens, DetectionMode::Weak);
        assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves numbers");
    }

    #[test]
    fn different_token_kinds_produce_distinct_hashes() {
        // All distinct token kinds with same inner value where applicable
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("x".to_string())),
            make_token(TokenKind::StringLiteral("x".to_string())),
            make_token(TokenKind::NumericLiteral("1".to_string())),
            make_token(TokenKind::BooleanLiteral(true)),
            make_token(TokenKind::NullLiteral),
            make_token(TokenKind::TemplateLiteral),
            make_token(TokenKind::RegExpLiteral),
            make_token(TokenKind::Operator(OperatorType::Add)),
            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
        ];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        // Each pair should have distinct hashes (different kind discriminant byte)
        for i in 0..hashed.len() {
            for j in (i + 1)..hashed.len() {
                assert_ne!(
                    hashed[i].hash, hashed[j].hash,
                    "Token at index {i} and {j} should have distinct hashes"
                );
            }
        }
    }

    // ── Configurable normalization tests ──────────────────────────

    #[test]
    fn resolved_strict_with_ignore_identifiers_override() {
        // Strict mode normally preserves identifiers, but override blinds them
        let norm = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed.len(), 2);
        // Identifiers should be blinded
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_strings_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_numbers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_semantic_with_preserve_identifiers_override() {
        // Semantic mode normally blinds identifiers, but override preserves them
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, norm);
        // Identifiers should be preserved (different hashes)
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_normalization_from_mode_defaults() {
        use fallow_config::NormalizationConfig;

        // Strict mode defaults: preserve everything
        let norm =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(!norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        // Weak mode defaults: blind strings only
        let norm =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        // Semantic mode defaults: blind all
        let norm = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(norm.ignore_numeric_values);
    }

    #[test]
    fn resolved_normalization_overrides_mode_defaults() {
        use fallow_config::NormalizationConfig;

        // Strict mode with explicit override to blind identifiers
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None, // Use mode default (false)
            ignore_numeric_values: None,
        };
        let norm = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(norm.ignore_identifiers); // Overridden
        assert!(!norm.ignore_string_values); // Mode default
        assert!(!norm.ignore_numeric_values); // Mode default
    }

    mod proptests {
        use super::*;
        use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
        use oxc_span::Span;
        use proptest::prelude::*;

        fn make_token(kind: TokenKind) -> SourceToken {
            SourceToken {
                kind,
                span: Span::new(0, 0),
            }
        }

        fn arb_detection_mode() -> impl Strategy<Value = DetectionMode> {
            prop::sample::select(vec![
                DetectionMode::Strict,
                DetectionMode::Mild,
                DetectionMode::Weak,
                DetectionMode::Semantic,
            ])
        }

        fn arb_normalization() -> impl Strategy<Value = ResolvedNormalization> {
            (any::<bool>(), any::<bool>(), any::<bool>()).prop_map(|(ids, strings, nums)| {
                ResolvedNormalization {
                    ignore_identifiers: ids,
                    ignore_string_values: strings,
                    ignore_numeric_values: nums,
                }
            })
        }

        fn arb_token_kind() -> impl Strategy<Value = TokenKind> {
            prop_oneof![
                Just(TokenKind::Keyword(KeywordType::Const)),
                Just(TokenKind::Keyword(KeywordType::If)),
                Just(TokenKind::Keyword(KeywordType::Return)),
                "[a-zA-Z_][a-zA-Z0-9_]{0,30}".prop_map(TokenKind::Identifier),
                "[a-zA-Z0-9 _.,!?]{0,50}".prop_map(TokenKind::StringLiteral),
                "[0-9]{1,10}(\\.[0-9]{1,5})?".prop_map(TokenKind::NumericLiteral),
                any::<bool>().prop_map(TokenKind::BooleanLiteral),
                Just(TokenKind::NullLiteral),
                Just(TokenKind::TemplateLiteral),
                Just(TokenKind::RegExpLiteral),
                Just(TokenKind::Operator(OperatorType::Add)),
                Just(TokenKind::Operator(OperatorType::Assign)),
                Just(TokenKind::Punctuation(PunctuationType::OpenParen)),
                Just(TokenKind::Punctuation(PunctuationType::CloseParen)),
            ]
        }

        proptest! {
            /// Normalizing a token twice produces the same result as normalizing once (idempotency).
            #[test]
            fn normalization_is_idempotent(
                kind in arb_token_kind(),
                norm in arb_normalization(),
            ) {
                let token = make_token(kind);
                let first = normalize_and_hash_resolved(std::slice::from_ref(&token), norm);
                // The hash is computed directly from the token kind + normalization flags.
                // Running it again on the same input must yield the same hash.
                let second = normalize_and_hash_resolved(&[token], norm);
                prop_assert_eq!(first.len(), second.len());
                for (a, b) in first.iter().zip(second.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Normalization should be idempotent");
                }
            }

            /// Same input always produces the same output (determinism).
            #[test]
            fn normalization_is_deterministic(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result1 = normalize_and_hash(&tokens, mode);
                let result2 = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(result1.len(), result2.len());
                for (a, b) in result1.iter().zip(result2.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Same input must produce same hash");
                    prop_assert_eq!(a.original_index, b.original_index);
                }
            }

            /// Output length always equals input length (no tokens are filtered).
            #[test]
            fn output_length_matches_input(
                kinds in prop::collection::vec(arb_token_kind(), 0..30),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(
                    result.len(), tokens.len(),
                    "Output should have same length as input"
                );
            }

            /// Original indices should be sequential 0..n.
            #[test]
            fn original_indices_are_sequential(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                norm in arb_normalization(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash_resolved(&tokens, norm);
                for (i, hashed) in result.iter().enumerate() {
                    prop_assert_eq!(hashed.original_index, i);
                }
            }
        }
    }
}