//! Token normalization and hashing for the duplicate-detection engine.
//!
//! Path: fallow_core/duplicates/normalize.rs
1use xxhash_rust::xxh3::xxh3_64;
2
3use super::tokenize::{SourceToken, TokenKind};
4use fallow_config::{DetectionMode, NormalizationConfig, ResolvedNormalization};
5
/// A token with a precomputed hash for use in the detection engine.
///
/// Produced by `normalize_and_hash` / `normalize_and_hash_resolved`. The hash
/// already reflects the active normalization flags (blinded identifiers and
/// literals hash to per-kind sentinels), so two tokens that are equivalent
/// under the chosen normalization carry equal `hash` values.
#[derive(Debug, Clone)]
pub struct HashedToken {
    /// xxh3 hash of the normalized token.
    pub hash: u64,
    /// Index of this token in the original (pre-normalization) token sequence.
    /// Used to map window matches back to source spans.
    pub original_index: usize,
}
14
15/// Normalize and hash a token sequence according to the detection mode.
16///
17/// Returns a vector of `HashedToken` values ready for the Rabin-Karp sliding window.
18/// Tokens that should be skipped (based on mode) are excluded from the output.
19pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
20    let resolved = ResolvedNormalization::resolve(mode, &NormalizationConfig::default());
21    normalize_and_hash_resolved(tokens, &resolved)
22}
23
24/// Normalize and hash with explicit resolved normalization flags.
25///
26/// This is the primary normalization entry point when using configurable overrides.
27pub fn normalize_and_hash_resolved(
28    tokens: &[SourceToken],
29    normalization: &ResolvedNormalization,
30) -> Vec<HashedToken> {
31    let mut result = Vec::with_capacity(tokens.len());
32
33    for (i, token) in tokens.iter().enumerate() {
34        let hash = hash_token_resolved(&token.kind, *normalization);
35        result.push(HashedToken {
36            hash,
37            original_index: i,
38        });
39    }
40
41    result
42}
43
44/// Hash a single token using resolved normalization flags.
45fn hash_token_resolved(kind: &TokenKind, norm: ResolvedNormalization) -> u64 {
46    match kind {
47        TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
48        TokenKind::Identifier(name) => {
49            if norm.ignore_identifiers {
50                hash_bytes(&[1, 0])
51            } else {
52                let mut buf = vec![1];
53                buf.extend_from_slice(name.as_bytes());
54                hash_bytes(&buf)
55            }
56        }
57        TokenKind::StringLiteral(val) => {
58            if norm.ignore_string_values {
59                hash_bytes(&[2, 0])
60            } else {
61                let mut buf = vec![2];
62                buf.extend_from_slice(val.as_bytes());
63                hash_bytes(&buf)
64            }
65        }
66        TokenKind::NumericLiteral(val) => {
67            if norm.ignore_numeric_values {
68                hash_bytes(&[3, 0])
69            } else {
70                let mut buf = vec![3];
71                buf.extend_from_slice(val.as_bytes());
72                hash_bytes(&buf)
73            }
74        }
75        TokenKind::BooleanLiteral(val) => hash_bytes(&[4, u8::from(*val)]),
76        TokenKind::NullLiteral => hash_bytes(&[5]),
77        TokenKind::TemplateLiteral => hash_bytes(&[6]),
78        TokenKind::RegExpLiteral => hash_bytes(&[7]),
79        TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
80        TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
81    }
82}
83
/// Hash a byte slice using xxh3.
///
/// Thin wrapper so the hash function can be swapped in one place. xxh3 is a
/// fast non-cryptographic 64-bit hash — appropriate for Rabin-Karp
/// fingerprinting, not for security-sensitive use.
fn hash_bytes(data: &[u8]) -> u64 {
    xxh3_64(data)
}
88
#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
    use oxc_span::Span;

    // Wraps a `TokenKind` in a `SourceToken` with a dummy span; span
    // positions do not participate in hashing.
    fn make_token(kind: TokenKind) -> SourceToken {
        SourceToken {
            kind,
            span: Span::new(0, 0),
        }
    }

    #[test]
    fn strict_mode_preserves_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_eq!(hashed.len(), 2);
        // Different identifiers should have different hashes in strict mode
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        // Different identifiers should have the SAME hash in semantic mode
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_string_literals() {
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_numeric_literals() {
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    // Booleans are never blinded: only identifiers, strings, and numbers
    // have normalization flags.
    #[test]
    fn semantic_mode_preserves_booleans() {
        let tokens = vec![
            make_token(TokenKind::BooleanLiteral(true)),
            make_token(TokenKind::BooleanLiteral(false)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_keywords() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::If)),
            make_token(TokenKind::Keyword(KeywordType::While)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn preserves_original_indices() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("x".to_string())),
            make_token(TokenKind::Operator(OperatorType::Assign)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert_eq!(hashed.len(), 3);
        assert_eq!(hashed[0].original_index, 0);
        assert_eq!(hashed[1].original_index, 1);
        assert_eq!(hashed[2].original_index, 2);
    }

    #[test]
    fn empty_input_produces_empty_output() {
        let tokens: Vec<SourceToken> = vec![];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert!(hashed.is_empty());
    }

    #[test]
    fn operators_have_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Operator(OperatorType::Add)),
            make_token(TokenKind::Operator(OperatorType::Sub)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn punctuation_has_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
            make_token(TokenKind::Punctuation(PunctuationType::CloseParen)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    // ── Configurable normalization tests ──────────────────────────

    #[test]
    fn resolved_strict_with_ignore_identifiers_override() {
        // Strict mode normally preserves identifiers, but override blinds them
        let norm = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, &norm);
        assert_eq!(hashed.len(), 2);
        // Identifiers should be blinded
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_strings_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, &norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_numbers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, &norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_semantic_with_preserve_identifiers_override() {
        // Semantic mode normally blinds identifiers, but override preserves them
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, &norm);
        // Identifiers should be preserved (different hashes)
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_normalization_from_mode_defaults() {
        use fallow_config::NormalizationConfig;

        // Strict mode defaults: preserve everything
        let norm =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(!norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        // Weak mode defaults: blind strings only
        let norm =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        // Semantic mode defaults: blind all
        let norm = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(norm.ignore_numeric_values);
    }

    #[test]
    fn resolved_normalization_overrides_mode_defaults() {
        use fallow_config::NormalizationConfig;

        // Strict mode with explicit override to blind identifiers
        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None, // Use mode default (false)
            ignore_numeric_values: None,
        };
        let norm = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(norm.ignore_identifiers); // Overridden
        assert!(!norm.ignore_string_values); // Mode default
        assert!(!norm.ignore_numeric_values); // Mode default
    }

    // Property-based tests: idempotency, determinism, and structural
    // invariants over arbitrary token sequences and normalization flags.
    mod proptests {
        use super::*;
        use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
        use oxc_span::Span;
        use proptest::prelude::*;

        fn make_token(kind: TokenKind) -> SourceToken {
            SourceToken {
                kind,
                span: Span::new(0, 0),
            }
        }

        fn arb_detection_mode() -> impl Strategy<Value = DetectionMode> {
            prop::sample::select(vec![
                DetectionMode::Strict,
                DetectionMode::Mild,
                DetectionMode::Weak,
                DetectionMode::Semantic,
            ])
        }

        fn arb_normalization() -> impl Strategy<Value = ResolvedNormalization> {
            (any::<bool>(), any::<bool>(), any::<bool>()).prop_map(|(ids, strings, nums)| {
                ResolvedNormalization {
                    ignore_identifiers: ids,
                    ignore_string_values: strings,
                    ignore_numeric_values: nums,
                }
            })
        }

        // Covers every TokenKind variant; regex strategies generate the
        // value-carrying variants with realistic payloads.
        fn arb_token_kind() -> impl Strategy<Value = TokenKind> {
            prop_oneof![
                Just(TokenKind::Keyword(KeywordType::Const)),
                Just(TokenKind::Keyword(KeywordType::If)),
                Just(TokenKind::Keyword(KeywordType::Return)),
                "[a-zA-Z_][a-zA-Z0-9_]{0,30}".prop_map(TokenKind::Identifier),
                "[a-zA-Z0-9 _.,!?]{0,50}".prop_map(TokenKind::StringLiteral),
                "[0-9]{1,10}(\\.[0-9]{1,5})?".prop_map(TokenKind::NumericLiteral),
                any::<bool>().prop_map(TokenKind::BooleanLiteral),
                Just(TokenKind::NullLiteral),
                Just(TokenKind::TemplateLiteral),
                Just(TokenKind::RegExpLiteral),
                Just(TokenKind::Operator(OperatorType::Add)),
                Just(TokenKind::Operator(OperatorType::Assign)),
                Just(TokenKind::Punctuation(PunctuationType::OpenParen)),
                Just(TokenKind::Punctuation(PunctuationType::CloseParen)),
            ]
        }

        proptest! {
            /// Normalizing a token twice produces the same result as normalizing once (idempotency).
            #[test]
            fn normalization_is_idempotent(
                kind in arb_token_kind(),
                norm in arb_normalization(),
            ) {
                let token = make_token(kind);
                let first = normalize_and_hash_resolved(&[token.clone()], &norm);
                // The hash is computed directly from the token kind + normalization flags.
                // Running it again on the same input must yield the same hash.
                let second = normalize_and_hash_resolved(&[token], &norm);
                prop_assert_eq!(first.len(), second.len());
                for (a, b) in first.iter().zip(second.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Normalization should be idempotent");
                }
            }

            /// Same input always produces the same output (determinism).
            #[test]
            fn normalization_is_deterministic(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result1 = normalize_and_hash(&tokens, mode);
                let result2 = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(result1.len(), result2.len());
                for (a, b) in result1.iter().zip(result2.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Same input must produce same hash");
                    prop_assert_eq!(a.original_index, b.original_index);
                }
            }

            /// Output length always equals input length (no tokens are filtered).
            #[test]
            fn output_length_matches_input(
                kinds in prop::collection::vec(arb_token_kind(), 0..30),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(
                    result.len(), tokens.len(),
                    "Output should have same length as input"
                );
            }

            /// Original indices should be sequential 0..n.
            #[test]
            fn original_indices_are_sequential(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                norm in arb_normalization(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash_resolved(&tokens, &norm);
                for (i, hashed) in result.iter().enumerate() {
                    prop_assert_eq!(hashed.original_index, i);
                }
            }
        }
    }
}