Skip to main content

fallow_core/duplicates/
normalize.rs

1use xxhash_rust::xxh3::xxh3_64;
2
3use super::tokenize::{SourceToken, TokenKind};
4use fallow_config::{DetectionMode, NormalizationConfig, ResolvedNormalization};
5
6/// A token with a precomputed hash for use in the detection engine.
7#[derive(Debug, Clone)]
8pub struct HashedToken {
9    /// Hash of the normalized token.
10    pub hash: u64,
11    /// Index of this token in the original (pre-normalization) token sequence.
12    pub original_index: usize,
13}
14
15/// Normalize and hash a token sequence according to the detection mode.
16///
17/// Returns a vector of `HashedToken` values ready for the Rabin-Karp sliding window.
18/// Tokens that should be skipped (based on mode) are excluded from the output.
19#[must_use]
20pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
21    let resolved = ResolvedNormalization::resolve(mode, &NormalizationConfig::default());
22    normalize_and_hash_resolved(tokens, resolved)
23}
24
25/// Normalize and hash with explicit resolved normalization flags.
26///
27/// This is the primary normalization entry point when using configurable overrides.
28#[must_use]
29pub fn normalize_and_hash_resolved(
30    tokens: &[SourceToken],
31    normalization: ResolvedNormalization,
32) -> Vec<HashedToken> {
33    let mut result = Vec::with_capacity(tokens.len());
34
35    for (i, token) in tokens.iter().enumerate() {
36        let hash = hash_token_resolved(&token.kind, normalization);
37        result.push(HashedToken {
38            hash,
39            original_index: i,
40        });
41    }
42
43    result
44}
45
46/// Hash a single token using resolved normalization flags.
47fn hash_token_resolved(kind: &TokenKind, norm: ResolvedNormalization) -> u64 {
48    match kind {
49        TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
50        TokenKind::Identifier(name) => {
51            if norm.ignore_identifiers {
52                hash_bytes(&[1, 0])
53            } else {
54                let mut buf = vec![1];
55                buf.extend_from_slice(name.as_bytes());
56                hash_bytes(&buf)
57            }
58        }
59        TokenKind::StringLiteral(val) => {
60            if norm.ignore_string_values {
61                hash_bytes(&[2, 0])
62            } else {
63                let mut buf = vec![2];
64                buf.extend_from_slice(val.as_bytes());
65                hash_bytes(&buf)
66            }
67        }
68        TokenKind::NumericLiteral(val) => {
69            if norm.ignore_numeric_values {
70                hash_bytes(&[3, 0])
71            } else {
72                let mut buf = vec![3];
73                buf.extend_from_slice(val.as_bytes());
74                hash_bytes(&buf)
75            }
76        }
77        TokenKind::BooleanLiteral(val) => hash_bytes(&[4, u8::from(*val)]),
78        TokenKind::NullLiteral => hash_bytes(&[5]),
79        TokenKind::TemplateLiteral => hash_bytes(&[6]),
80        TokenKind::RegExpLiteral => hash_bytes(&[7]),
81        TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
82        TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
83    }
84}
85
86/// Hash a byte slice using xxh3.
87fn hash_bytes(data: &[u8]) -> u64 {
88    xxh3_64(data)
89}
90
91#[cfg(test)]
92mod tests {
93    use super::*;
94    use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
95    use oxc_span::Span;
96
97    fn make_token(kind: TokenKind) -> SourceToken {
98        SourceToken {
99            kind,
100            span: Span::new(0, 0),
101        }
102    }
103
104    #[test]
105    fn strict_mode_preserves_identifiers() {
106        let tokens = vec![
107            make_token(TokenKind::Identifier("foo".to_string())),
108            make_token(TokenKind::Identifier("bar".to_string())),
109        ];
110
111        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
112        assert_eq!(hashed.len(), 2);
113        assert_ne!(hashed[0].hash, hashed[1].hash);
114    }
115
116    #[test]
117    fn semantic_mode_blinds_identifiers() {
118        let tokens = vec![
119            make_token(TokenKind::Identifier("foo".to_string())),
120            make_token(TokenKind::Identifier("bar".to_string())),
121        ];
122
123        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
124        assert_eq!(hashed.len(), 2);
125        assert_eq!(hashed[0].hash, hashed[1].hash);
126    }
127
128    #[test]
129    fn semantic_mode_blinds_string_literals() {
130        let tokens = vec![
131            make_token(TokenKind::StringLiteral("hello".to_string())),
132            make_token(TokenKind::StringLiteral("world".to_string())),
133        ];
134
135        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
136        assert_eq!(hashed.len(), 2);
137        assert_eq!(hashed[0].hash, hashed[1].hash);
138    }
139
140    #[test]
141    fn semantic_mode_blinds_numeric_literals() {
142        let tokens = vec![
143            make_token(TokenKind::NumericLiteral("42".to_string())),
144            make_token(TokenKind::NumericLiteral("99".to_string())),
145        ];
146
147        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
148        assert_eq!(hashed.len(), 2);
149        assert_eq!(hashed[0].hash, hashed[1].hash);
150    }
151
152    #[test]
153    fn semantic_mode_preserves_booleans() {
154        let tokens = vec![
155            make_token(TokenKind::BooleanLiteral(true)),
156            make_token(TokenKind::BooleanLiteral(false)),
157        ];
158
159        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
160        assert_eq!(hashed.len(), 2);
161        assert_ne!(hashed[0].hash, hashed[1].hash);
162    }
163
164    #[test]
165    fn semantic_mode_preserves_keywords() {
166        let tokens = vec![
167            make_token(TokenKind::Keyword(KeywordType::If)),
168            make_token(TokenKind::Keyword(KeywordType::While)),
169        ];
170
171        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
172        assert_eq!(hashed.len(), 2);
173        assert_ne!(hashed[0].hash, hashed[1].hash);
174    }
175
176    #[test]
177    fn preserves_original_indices() {
178        let tokens = vec![
179            make_token(TokenKind::Keyword(KeywordType::Const)),
180            make_token(TokenKind::Identifier("x".to_string())),
181            make_token(TokenKind::Operator(OperatorType::Assign)),
182        ];
183
184        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
185        assert_eq!(hashed.len(), 3);
186        assert_eq!(hashed[0].original_index, 0);
187        assert_eq!(hashed[1].original_index, 1);
188        assert_eq!(hashed[2].original_index, 2);
189    }
190
191    #[test]
192    fn empty_input_produces_empty_output() {
193        let tokens: Vec<SourceToken> = vec![];
194        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
195        assert!(hashed.is_empty());
196    }
197
198    #[test]
199    fn operators_have_distinct_hashes() {
200        let tokens = vec![
201            make_token(TokenKind::Operator(OperatorType::Add)),
202            make_token(TokenKind::Operator(OperatorType::Sub)),
203        ];
204
205        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
206        assert_ne!(hashed[0].hash, hashed[1].hash);
207    }
208
209    #[test]
210    fn punctuation_has_distinct_hashes() {
211        let tokens = vec![
212            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
213            make_token(TokenKind::Punctuation(PunctuationType::CloseParen)),
214        ];
215
216        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
217        assert_ne!(hashed[0].hash, hashed[1].hash);
218    }
219
220    #[test]
221    fn null_literal_has_stable_hash() {
222        let tokens = vec![make_token(TokenKind::NullLiteral)];
223        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
224        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
225        assert_eq!(h1[0].hash, h2[0].hash);
226    }
227
228    #[test]
229    fn template_literal_has_stable_hash() {
230        let tokens = vec![make_token(TokenKind::TemplateLiteral)];
231        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
232        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
233        assert_eq!(h1[0].hash, h2[0].hash);
234    }
235
236    #[test]
237    fn regexp_literal_has_stable_hash() {
238        let tokens = vec![make_token(TokenKind::RegExpLiteral)];
239        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
240        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
241        assert_eq!(h1[0].hash, h2[0].hash);
242    }
243
244    #[test]
245    fn null_template_regexp_have_distinct_hashes() {
246        let tokens = vec![
247            make_token(TokenKind::NullLiteral),
248            make_token(TokenKind::TemplateLiteral),
249            make_token(TokenKind::RegExpLiteral),
250        ];
251        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
252        assert_ne!(hashed[0].hash, hashed[1].hash);
253        assert_ne!(hashed[1].hash, hashed[2].hash);
254        assert_ne!(hashed[0].hash, hashed[2].hash);
255    }
256
257    #[test]
258    fn mild_mode_equivalent_to_strict() {
259        let id_tokens = vec![
260            make_token(TokenKind::Identifier("foo".to_string())),
261            make_token(TokenKind::Identifier("bar".to_string())),
262        ];
263        let hashed = normalize_and_hash(&id_tokens, DetectionMode::Mild);
264        assert_ne!(hashed[0].hash, hashed[1].hash);
265
266        let str_tokens = vec![
267            make_token(TokenKind::StringLiteral("hello".to_string())),
268            make_token(TokenKind::StringLiteral("world".to_string())),
269        ];
270        let hashed = normalize_and_hash(&str_tokens, DetectionMode::Mild);
271        assert_ne!(hashed[0].hash, hashed[1].hash);
272
273        let num_tokens = vec![
274            make_token(TokenKind::NumericLiteral("42".to_string())),
275            make_token(TokenKind::NumericLiteral("99".to_string())),
276        ];
277        let hashed = normalize_and_hash(&num_tokens, DetectionMode::Mild);
278        assert_ne!(hashed[0].hash, hashed[1].hash);
279    }
280
281    #[test]
282    fn weak_mode_blinds_strings_only() {
283        let id_tokens = vec![
284            make_token(TokenKind::Identifier("foo".to_string())),
285            make_token(TokenKind::Identifier("bar".to_string())),
286        ];
287        let hashed = normalize_and_hash(&id_tokens, DetectionMode::Weak);
288        assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves identifiers");
289
290        let num_tokens = vec![
291            make_token(TokenKind::NumericLiteral("42".to_string())),
292            make_token(TokenKind::NumericLiteral("99".to_string())),
293        ];
294        let hashed = normalize_and_hash(&num_tokens, DetectionMode::Weak);
295        assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves numbers");
296    }
297
298    #[test]
299    fn different_token_kinds_produce_distinct_hashes() {
300        let tokens = vec![
301            make_token(TokenKind::Keyword(KeywordType::Const)),
302            make_token(TokenKind::Identifier("x".to_string())),
303            make_token(TokenKind::StringLiteral("x".to_string())),
304            make_token(TokenKind::NumericLiteral("1".to_string())),
305            make_token(TokenKind::BooleanLiteral(true)),
306            make_token(TokenKind::NullLiteral),
307            make_token(TokenKind::TemplateLiteral),
308            make_token(TokenKind::RegExpLiteral),
309            make_token(TokenKind::Operator(OperatorType::Add)),
310            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
311        ];
312        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
313        for i in 0..hashed.len() {
314            for j in (i + 1)..hashed.len() {
315                assert_ne!(
316                    hashed[i].hash, hashed[j].hash,
317                    "Token at index {i} and {j} should have distinct hashes"
318                );
319            }
320        }
321    }
322
323    #[test]
324    fn resolved_strict_with_ignore_identifiers_override() {
325        let norm = ResolvedNormalization {
326            ignore_identifiers: true,
327            ignore_string_values: false,
328            ignore_numeric_values: false,
329        };
330        let tokens = vec![
331            make_token(TokenKind::Identifier("foo".to_string())),
332            make_token(TokenKind::Identifier("bar".to_string())),
333        ];
334
335        let hashed = normalize_and_hash_resolved(&tokens, norm);
336        assert_eq!(hashed.len(), 2);
337        assert_eq!(hashed[0].hash, hashed[1].hash);
338    }
339
340    #[test]
341    fn resolved_strict_with_ignore_strings_override() {
342        let norm = ResolvedNormalization {
343            ignore_identifiers: false,
344            ignore_string_values: true,
345            ignore_numeric_values: false,
346        };
347        let tokens = vec![
348            make_token(TokenKind::StringLiteral("hello".to_string())),
349            make_token(TokenKind::StringLiteral("world".to_string())),
350        ];
351
352        let hashed = normalize_and_hash_resolved(&tokens, norm);
353        assert_eq!(hashed[0].hash, hashed[1].hash);
354    }
355
356    #[test]
357    fn resolved_strict_with_ignore_numbers_override() {
358        let norm = ResolvedNormalization {
359            ignore_identifiers: false,
360            ignore_string_values: false,
361            ignore_numeric_values: true,
362        };
363        let tokens = vec![
364            make_token(TokenKind::NumericLiteral("42".to_string())),
365            make_token(TokenKind::NumericLiteral("99".to_string())),
366        ];
367
368        let hashed = normalize_and_hash_resolved(&tokens, norm);
369        assert_eq!(hashed[0].hash, hashed[1].hash);
370    }
371
372    #[test]
373    fn resolved_semantic_with_preserve_identifiers_override() {
374        let norm = ResolvedNormalization {
375            ignore_identifiers: false,
376            ignore_string_values: true,
377            ignore_numeric_values: true,
378        };
379        let tokens = vec![
380            make_token(TokenKind::Identifier("foo".to_string())),
381            make_token(TokenKind::Identifier("bar".to_string())),
382        ];
383
384        let hashed = normalize_and_hash_resolved(&tokens, norm);
385        assert_ne!(hashed[0].hash, hashed[1].hash);
386    }
387
388    #[test]
389    fn resolved_normalization_from_mode_defaults() {
390        use fallow_config::NormalizationConfig;
391
392        let norm =
393            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
394        assert!(!norm.ignore_identifiers);
395        assert!(!norm.ignore_string_values);
396        assert!(!norm.ignore_numeric_values);
397
398        let norm =
399            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
400        assert!(!norm.ignore_identifiers);
401        assert!(norm.ignore_string_values);
402        assert!(!norm.ignore_numeric_values);
403
404        let norm = ResolvedNormalization::resolve(
405            DetectionMode::Semantic,
406            &NormalizationConfig::default(),
407        );
408        assert!(norm.ignore_identifiers);
409        assert!(norm.ignore_string_values);
410        assert!(norm.ignore_numeric_values);
411    }
412
413    #[test]
414    fn resolved_normalization_overrides_mode_defaults() {
415        use fallow_config::NormalizationConfig;
416
417        let overrides = NormalizationConfig {
418            ignore_identifiers: Some(true),
419            ignore_string_values: None,
420            ignore_numeric_values: None,
421        };
422        let norm = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
423        assert!(norm.ignore_identifiers);
424        assert!(!norm.ignore_string_values);
425        assert!(!norm.ignore_numeric_values);
426    }
427
428    mod proptests {
429        use super::*;
430        use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
431        use oxc_span::Span;
432        use proptest::prelude::*;
433
434        fn make_token(kind: TokenKind) -> SourceToken {
435            SourceToken {
436                kind,
437                span: Span::new(0, 0),
438            }
439        }
440
441        fn arb_detection_mode() -> impl Strategy<Value = DetectionMode> {
442            prop::sample::select(vec![
443                DetectionMode::Strict,
444                DetectionMode::Mild,
445                DetectionMode::Weak,
446                DetectionMode::Semantic,
447            ])
448        }
449
450        fn arb_normalization() -> impl Strategy<Value = ResolvedNormalization> {
451            (any::<bool>(), any::<bool>(), any::<bool>()).prop_map(|(ids, strings, nums)| {
452                ResolvedNormalization {
453                    ignore_identifiers: ids,
454                    ignore_string_values: strings,
455                    ignore_numeric_values: nums,
456                }
457            })
458        }
459
460        fn arb_token_kind() -> impl Strategy<Value = TokenKind> {
461            prop_oneof![
462                Just(TokenKind::Keyword(KeywordType::Const)),
463                Just(TokenKind::Keyword(KeywordType::If)),
464                Just(TokenKind::Keyword(KeywordType::Return)),
465                "[a-zA-Z_][a-zA-Z0-9_]{0,30}".prop_map(TokenKind::Identifier),
466                "[a-zA-Z0-9 _.,!?]{0,50}".prop_map(TokenKind::StringLiteral),
467                "[0-9]{1,10}(\\.[0-9]{1,5})?".prop_map(TokenKind::NumericLiteral),
468                any::<bool>().prop_map(TokenKind::BooleanLiteral),
469                Just(TokenKind::NullLiteral),
470                Just(TokenKind::TemplateLiteral),
471                Just(TokenKind::RegExpLiteral),
472                Just(TokenKind::Operator(OperatorType::Add)),
473                Just(TokenKind::Operator(OperatorType::Assign)),
474                Just(TokenKind::Punctuation(PunctuationType::OpenParen)),
475                Just(TokenKind::Punctuation(PunctuationType::CloseParen)),
476            ]
477        }
478
479        proptest! {
480            /// Normalizing a token twice produces the same result as normalizing once (idempotency).
481            #[test]
482            fn normalization_is_idempotent(
483                kind in arb_token_kind(),
484                norm in arb_normalization(),
485            ) {
486                let token = make_token(kind);
487                let first = normalize_and_hash_resolved(std::slice::from_ref(&token), norm);
488                let second = normalize_and_hash_resolved(&[token], norm);
489                prop_assert_eq!(first.len(), second.len());
490                for (a, b) in first.iter().zip(second.iter()) {
491                    prop_assert_eq!(a.hash, b.hash, "Normalization should be idempotent");
492                }
493            }
494
495            /// Same input always produces the same output (determinism).
496            #[test]
497            fn normalization_is_deterministic(
498                kinds in prop::collection::vec(arb_token_kind(), 1..20),
499                mode in arb_detection_mode(),
500            ) {
501                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
502                let result1 = normalize_and_hash(&tokens, mode);
503                let result2 = normalize_and_hash(&tokens, mode);
504                prop_assert_eq!(result1.len(), result2.len());
505                for (a, b) in result1.iter().zip(result2.iter()) {
506                    prop_assert_eq!(a.hash, b.hash, "Same input must produce same hash");
507                    prop_assert_eq!(a.original_index, b.original_index);
508                }
509            }
510
511            /// Output length always equals input length (no tokens are filtered).
512            #[test]
513            fn output_length_matches_input(
514                kinds in prop::collection::vec(arb_token_kind(), 0..30),
515                mode in arb_detection_mode(),
516            ) {
517                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
518                let result = normalize_and_hash(&tokens, mode);
519                prop_assert_eq!(
520                    result.len(), tokens.len(),
521                    "Output should have same length as input"
522                );
523            }
524
525            /// Original indices should be sequential 0..n.
526            #[test]
527            fn original_indices_are_sequential(
528                kinds in prop::collection::vec(arb_token_kind(), 1..20),
529                norm in arb_normalization(),
530            ) {
531                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
532                let result = normalize_and_hash_resolved(&tokens, norm);
533                for (i, hashed) in result.iter().enumerate() {
534                    prop_assert_eq!(hashed.original_index, i);
535                }
536            }
537        }
538    }
539}