Skip to main content

fallow_core/duplicates/
normalize.rs

1use xxhash_rust::xxh3::xxh3_64;
2
3use super::tokenize::{SourceToken, TokenKind};
4use fallow_config::{DetectionMode, NormalizationConfig, ResolvedNormalization};
5
6/// A token with a precomputed hash for use in the detection engine.
7#[derive(Debug, Clone)]
8pub struct HashedToken {
9    /// Hash of the normalized token.
10    pub hash: u64,
11    /// Index of this token in the original (pre-normalization) token sequence.
12    pub original_index: usize,
13}
14
15/// Normalize and hash a token sequence according to the detection mode.
16///
17/// Returns a vector of `HashedToken` values ready for the Rabin-Karp sliding window.
18/// Tokens that should be skipped (based on mode) are excluded from the output.
19#[must_use]
20pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
21    let resolved = ResolvedNormalization::resolve(mode, &NormalizationConfig::default());
22    normalize_and_hash_resolved(tokens, resolved)
23}
24
25/// Normalize and hash with explicit resolved normalization flags.
26///
27/// This is the primary normalization entry point when using configurable overrides.
28#[must_use]
29pub fn normalize_and_hash_resolved(
30    tokens: &[SourceToken],
31    normalization: ResolvedNormalization,
32) -> Vec<HashedToken> {
33    let mut result = Vec::with_capacity(tokens.len());
34    let mut namespace = "js";
35
36    for (i, token) in tokens.iter().enumerate() {
37        if let TokenKind::Boundary(name) = &token.kind {
38            namespace = name;
39        }
40        let hash = hash_token_resolved(&token.kind, normalization, namespace);
41        result.push(HashedToken {
42            hash,
43            original_index: i,
44        });
45    }
46
47    result
48}
49
50/// Hash a single token using resolved normalization flags.
51fn hash_token_resolved(kind: &TokenKind, norm: ResolvedNormalization, namespace: &str) -> u64 {
52    match kind {
53        TokenKind::Keyword(kw) => hash_namespaced(namespace, &[0, *kw as u8]),
54        TokenKind::Identifier(name) => {
55            if norm.ignore_identifiers {
56                hash_namespaced(namespace, &[1, 0])
57            } else {
58                let mut buf = vec![1];
59                buf.extend_from_slice(name.as_bytes());
60                hash_namespaced(namespace, &buf)
61            }
62        }
63        TokenKind::StringLiteral(val) => {
64            if norm.ignore_string_values {
65                hash_namespaced(namespace, &[2, 0])
66            } else {
67                let mut buf = vec![2];
68                buf.extend_from_slice(val.as_bytes());
69                hash_namespaced(namespace, &buf)
70            }
71        }
72        TokenKind::NumericLiteral(val) => {
73            if norm.ignore_numeric_values {
74                hash_namespaced(namespace, &[3, 0])
75            } else {
76                let mut buf = vec![3];
77                buf.extend_from_slice(val.as_bytes());
78                hash_namespaced(namespace, &buf)
79            }
80        }
81        TokenKind::BooleanLiteral(val) => hash_namespaced(namespace, &[4, u8::from(*val)]),
82        TokenKind::NullLiteral => hash_namespaced(namespace, &[5]),
83        TokenKind::TemplateLiteral => hash_namespaced(namespace, &[6]),
84        TokenKind::RegExpLiteral => hash_namespaced(namespace, &[7]),
85        TokenKind::Operator(op) => hash_namespaced(namespace, &[8, *op as u8]),
86        TokenKind::Punctuation(p) => hash_namespaced(namespace, &[9, *p as u8]),
87        TokenKind::Boundary(name) => {
88            let mut buf = vec![10];
89            buf.extend_from_slice(name.as_bytes());
90            hash_bytes(&buf)
91        }
92    }
93}
94
95fn hash_namespaced(namespace: &str, data: &[u8]) -> u64 {
96    let mut buf = Vec::with_capacity(namespace.len() + 1 + data.len());
97    buf.extend_from_slice(namespace.as_bytes());
98    buf.push(0);
99    buf.extend_from_slice(data);
100    hash_bytes(&buf)
101}
102
103/// Hash a byte slice using xxh3.
104fn hash_bytes(data: &[u8]) -> u64 {
105    xxh3_64(data)
106}
107
108#[cfg(test)]
109mod tests {
110    use super::*;
111    use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
112    use oxc_span::Span;
113
114    fn make_token(kind: TokenKind) -> SourceToken {
115        SourceToken {
116            kind,
117            span: Span::new(0, 0),
118        }
119    }
120
121    #[test]
122    fn strict_mode_preserves_identifiers() {
123        let tokens = vec![
124            make_token(TokenKind::Identifier("foo".to_string())),
125            make_token(TokenKind::Identifier("bar".to_string())),
126        ];
127
128        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
129        assert_eq!(hashed.len(), 2);
130        assert_ne!(hashed[0].hash, hashed[1].hash);
131    }
132
133    #[test]
134    fn semantic_mode_blinds_identifiers() {
135        let tokens = vec![
136            make_token(TokenKind::Identifier("foo".to_string())),
137            make_token(TokenKind::Identifier("bar".to_string())),
138        ];
139
140        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
141        assert_eq!(hashed.len(), 2);
142        assert_eq!(hashed[0].hash, hashed[1].hash);
143    }
144
145    #[test]
146    fn semantic_mode_blinds_string_literals() {
147        let tokens = vec![
148            make_token(TokenKind::StringLiteral("hello".to_string())),
149            make_token(TokenKind::StringLiteral("world".to_string())),
150        ];
151
152        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
153        assert_eq!(hashed.len(), 2);
154        assert_eq!(hashed[0].hash, hashed[1].hash);
155    }
156
157    #[test]
158    fn semantic_mode_blinds_numeric_literals() {
159        let tokens = vec![
160            make_token(TokenKind::NumericLiteral("42".to_string())),
161            make_token(TokenKind::NumericLiteral("99".to_string())),
162        ];
163
164        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
165        assert_eq!(hashed.len(), 2);
166        assert_eq!(hashed[0].hash, hashed[1].hash);
167    }
168
169    #[test]
170    fn semantic_mode_preserves_booleans() {
171        let tokens = vec![
172            make_token(TokenKind::BooleanLiteral(true)),
173            make_token(TokenKind::BooleanLiteral(false)),
174        ];
175
176        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
177        assert_eq!(hashed.len(), 2);
178        assert_ne!(hashed[0].hash, hashed[1].hash);
179    }
180
181    #[test]
182    fn semantic_mode_preserves_keywords() {
183        let tokens = vec![
184            make_token(TokenKind::Keyword(KeywordType::If)),
185            make_token(TokenKind::Keyword(KeywordType::While)),
186        ];
187
188        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
189        assert_eq!(hashed.len(), 2);
190        assert_ne!(hashed[0].hash, hashed[1].hash);
191    }
192
193    #[test]
194    fn preserves_original_indices() {
195        let tokens = vec![
196            make_token(TokenKind::Keyword(KeywordType::Const)),
197            make_token(TokenKind::Identifier("x".to_string())),
198            make_token(TokenKind::Operator(OperatorType::Assign)),
199        ];
200
201        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
202        assert_eq!(hashed.len(), 3);
203        assert_eq!(hashed[0].original_index, 0);
204        assert_eq!(hashed[1].original_index, 1);
205        assert_eq!(hashed[2].original_index, 2);
206    }
207
208    #[test]
209    fn empty_input_produces_empty_output() {
210        let tokens: Vec<SourceToken> = vec![];
211        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
212        assert!(hashed.is_empty());
213    }
214
215    #[test]
216    fn operators_have_distinct_hashes() {
217        let tokens = vec![
218            make_token(TokenKind::Operator(OperatorType::Add)),
219            make_token(TokenKind::Operator(OperatorType::Sub)),
220        ];
221
222        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
223        assert_ne!(hashed[0].hash, hashed[1].hash);
224    }
225
226    #[test]
227    fn punctuation_has_distinct_hashes() {
228        let tokens = vec![
229            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
230            make_token(TokenKind::Punctuation(PunctuationType::CloseParen)),
231        ];
232
233        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
234        assert_ne!(hashed[0].hash, hashed[1].hash);
235    }
236
237    #[test]
238    fn null_literal_has_stable_hash() {
239        let tokens = vec![make_token(TokenKind::NullLiteral)];
240        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
241        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
242        assert_eq!(h1[0].hash, h2[0].hash);
243    }
244
245    #[test]
246    fn template_literal_has_stable_hash() {
247        let tokens = vec![make_token(TokenKind::TemplateLiteral)];
248        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
249        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
250        assert_eq!(h1[0].hash, h2[0].hash);
251    }
252
253    #[test]
254    fn regexp_literal_has_stable_hash() {
255        let tokens = vec![make_token(TokenKind::RegExpLiteral)];
256        let h1 = normalize_and_hash(&tokens, DetectionMode::Strict);
257        let h2 = normalize_and_hash(&tokens, DetectionMode::Semantic);
258        assert_eq!(h1[0].hash, h2[0].hash);
259    }
260
261    #[test]
262    fn null_template_regexp_have_distinct_hashes() {
263        let tokens = vec![
264            make_token(TokenKind::NullLiteral),
265            make_token(TokenKind::TemplateLiteral),
266            make_token(TokenKind::RegExpLiteral),
267        ];
268        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
269        assert_ne!(hashed[0].hash, hashed[1].hash);
270        assert_ne!(hashed[1].hash, hashed[2].hash);
271        assert_ne!(hashed[0].hash, hashed[2].hash);
272    }
273
274    #[test]
275    fn mild_mode_equivalent_to_strict() {
276        let id_tokens = vec![
277            make_token(TokenKind::Identifier("foo".to_string())),
278            make_token(TokenKind::Identifier("bar".to_string())),
279        ];
280        let hashed = normalize_and_hash(&id_tokens, DetectionMode::Mild);
281        assert_ne!(hashed[0].hash, hashed[1].hash);
282
283        let str_tokens = vec![
284            make_token(TokenKind::StringLiteral("hello".to_string())),
285            make_token(TokenKind::StringLiteral("world".to_string())),
286        ];
287        let hashed = normalize_and_hash(&str_tokens, DetectionMode::Mild);
288        assert_ne!(hashed[0].hash, hashed[1].hash);
289
290        let num_tokens = vec![
291            make_token(TokenKind::NumericLiteral("42".to_string())),
292            make_token(TokenKind::NumericLiteral("99".to_string())),
293        ];
294        let hashed = normalize_and_hash(&num_tokens, DetectionMode::Mild);
295        assert_ne!(hashed[0].hash, hashed[1].hash);
296    }
297
298    #[test]
299    fn weak_mode_blinds_strings_only() {
300        let id_tokens = vec![
301            make_token(TokenKind::Identifier("foo".to_string())),
302            make_token(TokenKind::Identifier("bar".to_string())),
303        ];
304        let hashed = normalize_and_hash(&id_tokens, DetectionMode::Weak);
305        assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves identifiers");
306
307        let num_tokens = vec![
308            make_token(TokenKind::NumericLiteral("42".to_string())),
309            make_token(TokenKind::NumericLiteral("99".to_string())),
310        ];
311        let hashed = normalize_and_hash(&num_tokens, DetectionMode::Weak);
312        assert_ne!(hashed[0].hash, hashed[1].hash, "Weak preserves numbers");
313    }
314
315    #[test]
316    fn different_token_kinds_produce_distinct_hashes() {
317        let tokens = vec![
318            make_token(TokenKind::Keyword(KeywordType::Const)),
319            make_token(TokenKind::Identifier("x".to_string())),
320            make_token(TokenKind::StringLiteral("x".to_string())),
321            make_token(TokenKind::NumericLiteral("1".to_string())),
322            make_token(TokenKind::BooleanLiteral(true)),
323            make_token(TokenKind::NullLiteral),
324            make_token(TokenKind::TemplateLiteral),
325            make_token(TokenKind::RegExpLiteral),
326            make_token(TokenKind::Operator(OperatorType::Add)),
327            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
328        ];
329        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
330        for i in 0..hashed.len() {
331            for j in (i + 1)..hashed.len() {
332                assert_ne!(
333                    hashed[i].hash, hashed[j].hash,
334                    "Token at index {i} and {j} should have distinct hashes"
335                );
336            }
337        }
338    }
339
340    #[test]
341    fn resolved_strict_with_ignore_identifiers_override() {
342        let norm = ResolvedNormalization {
343            ignore_identifiers: true,
344            ignore_string_values: false,
345            ignore_numeric_values: false,
346        };
347        let tokens = vec![
348            make_token(TokenKind::Identifier("foo".to_string())),
349            make_token(TokenKind::Identifier("bar".to_string())),
350        ];
351
352        let hashed = normalize_and_hash_resolved(&tokens, norm);
353        assert_eq!(hashed.len(), 2);
354        assert_eq!(hashed[0].hash, hashed[1].hash);
355    }
356
357    #[test]
358    fn resolved_strict_with_ignore_strings_override() {
359        let norm = ResolvedNormalization {
360            ignore_identifiers: false,
361            ignore_string_values: true,
362            ignore_numeric_values: false,
363        };
364        let tokens = vec![
365            make_token(TokenKind::StringLiteral("hello".to_string())),
366            make_token(TokenKind::StringLiteral("world".to_string())),
367        ];
368
369        let hashed = normalize_and_hash_resolved(&tokens, norm);
370        assert_eq!(hashed[0].hash, hashed[1].hash);
371    }
372
373    #[test]
374    fn resolved_strict_with_ignore_numbers_override() {
375        let norm = ResolvedNormalization {
376            ignore_identifiers: false,
377            ignore_string_values: false,
378            ignore_numeric_values: true,
379        };
380        let tokens = vec![
381            make_token(TokenKind::NumericLiteral("42".to_string())),
382            make_token(TokenKind::NumericLiteral("99".to_string())),
383        ];
384
385        let hashed = normalize_and_hash_resolved(&tokens, norm);
386        assert_eq!(hashed[0].hash, hashed[1].hash);
387    }
388
389    #[test]
390    fn resolved_semantic_with_preserve_identifiers_override() {
391        let norm = ResolvedNormalization {
392            ignore_identifiers: false,
393            ignore_string_values: true,
394            ignore_numeric_values: true,
395        };
396        let tokens = vec![
397            make_token(TokenKind::Identifier("foo".to_string())),
398            make_token(TokenKind::Identifier("bar".to_string())),
399        ];
400
401        let hashed = normalize_and_hash_resolved(&tokens, norm);
402        assert_ne!(hashed[0].hash, hashed[1].hash);
403    }
404
405    #[test]
406    fn resolved_normalization_from_mode_defaults() {
407        use fallow_config::NormalizationConfig;
408
409        let norm =
410            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
411        assert!(!norm.ignore_identifiers);
412        assert!(!norm.ignore_string_values);
413        assert!(!norm.ignore_numeric_values);
414
415        let norm =
416            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
417        assert!(!norm.ignore_identifiers);
418        assert!(norm.ignore_string_values);
419        assert!(!norm.ignore_numeric_values);
420
421        let norm = ResolvedNormalization::resolve(
422            DetectionMode::Semantic,
423            &NormalizationConfig::default(),
424        );
425        assert!(norm.ignore_identifiers);
426        assert!(norm.ignore_string_values);
427        assert!(norm.ignore_numeric_values);
428    }
429
430    #[test]
431    fn resolved_normalization_overrides_mode_defaults() {
432        use fallow_config::NormalizationConfig;
433
434        let overrides = NormalizationConfig {
435            ignore_identifiers: Some(true),
436            ignore_string_values: None,
437            ignore_numeric_values: None,
438        };
439        let norm = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
440        assert!(norm.ignore_identifiers);
441        assert!(!norm.ignore_string_values);
442        assert!(!norm.ignore_numeric_values);
443    }
444
445    mod proptests {
446        use super::*;
447        use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
448        use oxc_span::Span;
449        use proptest::prelude::*;
450
451        fn make_token(kind: TokenKind) -> SourceToken {
452            SourceToken {
453                kind,
454                span: Span::new(0, 0),
455            }
456        }
457
458        fn arb_detection_mode() -> impl Strategy<Value = DetectionMode> {
459            prop::sample::select(vec![
460                DetectionMode::Strict,
461                DetectionMode::Mild,
462                DetectionMode::Weak,
463                DetectionMode::Semantic,
464            ])
465        }
466
467        fn arb_normalization() -> impl Strategy<Value = ResolvedNormalization> {
468            (any::<bool>(), any::<bool>(), any::<bool>()).prop_map(|(ids, strings, nums)| {
469                ResolvedNormalization {
470                    ignore_identifiers: ids,
471                    ignore_string_values: strings,
472                    ignore_numeric_values: nums,
473                }
474            })
475        }
476
477        fn arb_token_kind() -> impl Strategy<Value = TokenKind> {
478            prop_oneof![
479                Just(TokenKind::Keyword(KeywordType::Const)),
480                Just(TokenKind::Keyword(KeywordType::If)),
481                Just(TokenKind::Keyword(KeywordType::Return)),
482                "[a-zA-Z_][a-zA-Z0-9_]{0,30}".prop_map(TokenKind::Identifier),
483                "[a-zA-Z0-9 _.,!?]{0,50}".prop_map(TokenKind::StringLiteral),
484                "[0-9]{1,10}(\\.[0-9]{1,5})?".prop_map(TokenKind::NumericLiteral),
485                any::<bool>().prop_map(TokenKind::BooleanLiteral),
486                Just(TokenKind::NullLiteral),
487                Just(TokenKind::TemplateLiteral),
488                Just(TokenKind::RegExpLiteral),
489                Just(TokenKind::Operator(OperatorType::Add)),
490                Just(TokenKind::Operator(OperatorType::Assign)),
491                Just(TokenKind::Punctuation(PunctuationType::OpenParen)),
492                Just(TokenKind::Punctuation(PunctuationType::CloseParen)),
493            ]
494        }
495
496        proptest! {
497            /// Normalizing a token twice produces the same result as normalizing once (idempotency).
498            #[test]
499            fn normalization_is_idempotent(
500                kind in arb_token_kind(),
501                norm in arb_normalization(),
502            ) {
503                let token = make_token(kind);
504                let first = normalize_and_hash_resolved(std::slice::from_ref(&token), norm);
505                let second = normalize_and_hash_resolved(&[token], norm);
506                prop_assert_eq!(first.len(), second.len());
507                for (a, b) in first.iter().zip(second.iter()) {
508                    prop_assert_eq!(a.hash, b.hash, "Normalization should be idempotent");
509                }
510            }
511
512            /// Same input always produces the same output (determinism).
513            #[test]
514            fn normalization_is_deterministic(
515                kinds in prop::collection::vec(arb_token_kind(), 1..20),
516                mode in arb_detection_mode(),
517            ) {
518                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
519                let result1 = normalize_and_hash(&tokens, mode);
520                let result2 = normalize_and_hash(&tokens, mode);
521                prop_assert_eq!(result1.len(), result2.len());
522                for (a, b) in result1.iter().zip(result2.iter()) {
523                    prop_assert_eq!(a.hash, b.hash, "Same input must produce same hash");
524                    prop_assert_eq!(a.original_index, b.original_index);
525                }
526            }
527
528            /// Output length always equals input length (no tokens are filtered).
529            #[test]
530            fn output_length_matches_input(
531                kinds in prop::collection::vec(arb_token_kind(), 0..30),
532                mode in arb_detection_mode(),
533            ) {
534                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
535                let result = normalize_and_hash(&tokens, mode);
536                prop_assert_eq!(
537                    result.len(), tokens.len(),
538                    "Output should have same length as input"
539                );
540            }
541
542            /// Original indices should be sequential 0..n.
543            #[test]
544            fn original_indices_are_sequential(
545                kinds in prop::collection::vec(arb_token_kind(), 1..20),
546                norm in arb_normalization(),
547            ) {
548                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
549                let result = normalize_and_hash_resolved(&tokens, norm);
550                for (i, hashed) in result.iter().enumerate() {
551                    prop_assert_eq!(hashed.original_index, i);
552                }
553            }
554        }
555    }
556}