//! Token normalization and hashing for duplicate detection
//! (`fallow_core/duplicates/normalize.rs`).
use xxhash_rust::xxh3::xxh3_64;

use super::tokenize::{SourceToken, TokenKind};
use fallow_config::DetectionMode;

/// A token with a precomputed hash for use in the detection engine.
///
/// Two plain integers, so the type is trivially copyable; deriving
/// `Copy`/`PartialEq`/`Eq`/`Hash` lets the engine pass, compare, and
/// key on hashed tokens without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct HashedToken {
    /// Hash of the normalized token.
    pub hash: u64,
    /// Index of this token in the original (pre-normalization) token sequence.
    pub original_index: usize,
}

15/// Normalize and hash a token sequence according to the detection mode.
16///
17/// Returns a vector of `HashedToken` values ready for the Rabin-Karp sliding window.
18/// Tokens that should be skipped (based on mode) are excluded from the output.
19pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
20    let mut result = Vec::with_capacity(tokens.len());
21
22    for (i, token) in tokens.iter().enumerate() {
23        let normalized = normalize_token(&token.kind, mode);
24        if let Some(hash) = normalized {
25            result.push(HashedToken {
26                hash,
27                original_index: i,
28            });
29        }
30    }
31
32    result
33}
34
35/// Normalize a single token and compute its hash.
36/// Returns `None` if the token should be skipped in the given mode.
37fn normalize_token(kind: &TokenKind, mode: DetectionMode) -> Option<u64> {
38    match mode {
39        DetectionMode::Strict | DetectionMode::Mild => Some(hash_token_strict(kind)),
40        DetectionMode::Weak => Some(hash_token_weak(kind)),
41        DetectionMode::Semantic => Some(hash_token_semantic(kind)),
42    }
43}
44
45/// Hash a token preserving its full identity (strict/mild/weak modes).
46fn hash_token_strict(kind: &TokenKind) -> u64 {
47    match kind {
48        TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
49        TokenKind::Identifier(name) => {
50            let mut buf = vec![1];
51            buf.extend_from_slice(name.as_bytes());
52            hash_bytes(&buf)
53        }
54        TokenKind::StringLiteral(val) => {
55            let mut buf = vec![2];
56            buf.extend_from_slice(val.as_bytes());
57            hash_bytes(&buf)
58        }
59        TokenKind::NumericLiteral(val) => {
60            let mut buf = vec![3];
61            buf.extend_from_slice(val.as_bytes());
62            hash_bytes(&buf)
63        }
64        TokenKind::BooleanLiteral(val) => hash_bytes(&[4, *val as u8]),
65        TokenKind::NullLiteral => hash_bytes(&[5]),
66        TokenKind::TemplateLiteral => hash_bytes(&[6]),
67        TokenKind::RegExpLiteral => hash_bytes(&[7]),
68        TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
69        TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
70    }
71}
72
73/// Hash a token with string literals blinded (weak mode).
74fn hash_token_weak(kind: &TokenKind) -> u64 {
75    match kind {
76        // Blind string literals only — keep identifiers and numeric literals
77        TokenKind::StringLiteral(_) => hash_bytes(&[2, 0]),
78        other => hash_token_strict(other),
79    }
80}
81
82/// Hash a token with identifiers and literals blinded (semantic mode).
83fn hash_token_semantic(kind: &TokenKind) -> u64 {
84    match kind {
85        TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
86        // All identifiers map to the same hash
87        TokenKind::Identifier(_) => hash_bytes(&[1, 0]),
88        // All string literals map to the same hash
89        TokenKind::StringLiteral(_) => hash_bytes(&[2, 0]),
90        // All numeric literals map to the same hash
91        TokenKind::NumericLiteral(_) => hash_bytes(&[3, 0]),
92        // Booleans are kept as-is (structurally significant)
93        TokenKind::BooleanLiteral(val) => hash_bytes(&[4, *val as u8]),
94        TokenKind::NullLiteral => hash_bytes(&[5]),
95        TokenKind::TemplateLiteral => hash_bytes(&[6]),
96        TokenKind::RegExpLiteral => hash_bytes(&[7]),
97        TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
98        TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
99    }
100}
101
/// Hash a byte slice using xxh3.
///
/// xxh3 is a fast, non-cryptographic 64-bit hash — collision resistance is
/// best-effort only, which suffices for duplicate-candidate detection.
fn hash_bytes(data: &[u8]) -> u64 {
    xxh3_64(data)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
    use oxc_span::Span;

    /// Wrap a `TokenKind` in a `SourceToken` with a zero-length dummy span.
    fn make_token(kind: TokenKind) -> SourceToken {
        SourceToken {
            kind,
            span: Span::new(0, 0),
        }
    }

    /// Normalize a two-token sequence under `mode` and return both hashes.
    fn hash_pair(a: TokenKind, b: TokenKind, mode: DetectionMode) -> (u64, u64) {
        let hashed = normalize_and_hash(&[make_token(a), make_token(b)], mode);
        assert_eq!(hashed.len(), 2);
        (hashed[0].hash, hashed[1].hash)
    }

    #[test]
    fn strict_mode_preserves_identifiers() {
        // Different identifiers should have different hashes in strict mode.
        let (a, b) = hash_pair(
            TokenKind::Identifier("foo".to_string()),
            TokenKind::Identifier("bar".to_string()),
            DetectionMode::Strict,
        );
        assert_ne!(a, b);
    }

    #[test]
    fn semantic_mode_blinds_identifiers() {
        // Different identifiers should have the SAME hash in semantic mode.
        let (a, b) = hash_pair(
            TokenKind::Identifier("foo".to_string()),
            TokenKind::Identifier("bar".to_string()),
            DetectionMode::Semantic,
        );
        assert_eq!(a, b);
    }

    #[test]
    fn semantic_mode_blinds_string_literals() {
        let (a, b) = hash_pair(
            TokenKind::StringLiteral("hello".to_string()),
            TokenKind::StringLiteral("world".to_string()),
            DetectionMode::Semantic,
        );
        assert_eq!(a, b);
    }

    #[test]
    fn semantic_mode_blinds_numeric_literals() {
        let (a, b) = hash_pair(
            TokenKind::NumericLiteral("42".to_string()),
            TokenKind::NumericLiteral("99".to_string()),
            DetectionMode::Semantic,
        );
        assert_eq!(a, b);
    }

    #[test]
    fn semantic_mode_preserves_booleans() {
        // Booleans stay distinct even in semantic mode.
        let (a, b) = hash_pair(
            TokenKind::BooleanLiteral(true),
            TokenKind::BooleanLiteral(false),
            DetectionMode::Semantic,
        );
        assert_ne!(a, b);
    }

    #[test]
    fn semantic_mode_preserves_keywords() {
        let (a, b) = hash_pair(
            TokenKind::Keyword(KeywordType::If),
            TokenKind::Keyword(KeywordType::While),
            DetectionMode::Semantic,
        );
        assert_ne!(a, b);
    }

    #[test]
    fn preserves_original_indices() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("x".to_string())),
            make_token(TokenKind::Operator(OperatorType::Assign)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        let indices: Vec<usize> = hashed.iter().map(|t| t.original_index).collect();
        assert_eq!(indices, vec![0, 1, 2]);
    }

    #[test]
    fn empty_input_produces_empty_output() {
        assert!(normalize_and_hash(&[], DetectionMode::Mild).is_empty());
    }

    #[test]
    fn operators_have_distinct_hashes() {
        let (a, b) = hash_pair(
            TokenKind::Operator(OperatorType::Add),
            TokenKind::Operator(OperatorType::Sub),
            DetectionMode::Strict,
        );
        assert_ne!(a, b);
    }

    #[test]
    fn punctuation_has_distinct_hashes() {
        let (a, b) = hash_pair(
            TokenKind::Punctuation(PunctuationType::OpenParen),
            TokenKind::Punctuation(PunctuationType::CloseParen),
            DetectionMode::Strict,
        );
        assert_ne!(a, b);
    }
}