1use xxhash_rust::xxh3::xxh3_64;
2
3use super::tokenize::{SourceToken, TokenKind};
4use fallow_config::DetectionMode;
5
/// A normalized token hash paired with the position of the token it was
/// derived from in the original token stream.
///
/// Both fields are plain integers, so the type is `Copy` and usable as a
/// `HashMap`/`HashSet` key via the derived `Eq` + `Hash`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct HashedToken {
    /// xxh3-64 hash of the normalized token representation.
    pub hash: u64,
    /// Index of the source token in the slice passed to `normalize_and_hash`.
    pub original_index: usize,
}
14
15pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
20 let mut result = Vec::with_capacity(tokens.len());
21
22 for (i, token) in tokens.iter().enumerate() {
23 let normalized = normalize_token(&token.kind, mode);
24 if let Some(hash) = normalized {
25 result.push(HashedToken {
26 hash,
27 original_index: i,
28 });
29 }
30 }
31
32 result
33}
34
35fn normalize_token(kind: &TokenKind, mode: DetectionMode) -> Option<u64> {
38 match mode {
39 DetectionMode::Strict | DetectionMode::Mild => Some(hash_token_strict(kind)),
40 DetectionMode::Weak => Some(hash_token_weak(kind)),
41 DetectionMode::Semantic => Some(hash_token_semantic(kind)),
42 }
43}
44
45fn hash_token_strict(kind: &TokenKind) -> u64 {
47 match kind {
48 TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
49 TokenKind::Identifier(name) => {
50 let mut buf = vec![1];
51 buf.extend_from_slice(name.as_bytes());
52 hash_bytes(&buf)
53 }
54 TokenKind::StringLiteral(val) => {
55 let mut buf = vec![2];
56 buf.extend_from_slice(val.as_bytes());
57 hash_bytes(&buf)
58 }
59 TokenKind::NumericLiteral(val) => {
60 let mut buf = vec![3];
61 buf.extend_from_slice(val.as_bytes());
62 hash_bytes(&buf)
63 }
64 TokenKind::BooleanLiteral(val) => hash_bytes(&[4, *val as u8]),
65 TokenKind::NullLiteral => hash_bytes(&[5]),
66 TokenKind::TemplateLiteral => hash_bytes(&[6]),
67 TokenKind::RegExpLiteral => hash_bytes(&[7]),
68 TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
69 TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
70 }
71}
72
73fn hash_token_weak(kind: &TokenKind) -> u64 {
75 match kind {
76 TokenKind::StringLiteral(_) => hash_bytes(&[2, 0]),
78 other => hash_token_strict(other),
79 }
80}
81
82fn hash_token_semantic(kind: &TokenKind) -> u64 {
84 match kind {
85 TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
86 TokenKind::Identifier(_) => hash_bytes(&[1, 0]),
88 TokenKind::StringLiteral(_) => hash_bytes(&[2, 0]),
90 TokenKind::NumericLiteral(_) => hash_bytes(&[3, 0]),
92 TokenKind::BooleanLiteral(val) => hash_bytes(&[4, *val as u8]),
94 TokenKind::NullLiteral => hash_bytes(&[5]),
95 TokenKind::TemplateLiteral => hash_bytes(&[6]),
96 TokenKind::RegExpLiteral => hash_bytes(&[7]),
97 TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
98 TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
99 }
100}
101
/// Hash a byte slice with xxh3-64. Sole call site for the hash
/// primitive, so swapping the algorithm touches one line.
fn hash_bytes(data: &[u8]) -> u64 {
    xxh3_64(data)
}
106
#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
    use oxc_span::Span;

    /// Wrap a token kind in a `SourceToken` carrying a dummy span.
    fn token(kind: TokenKind) -> SourceToken {
        SourceToken {
            kind,
            span: Span::new(0, 0),
        }
    }

    /// Hash exactly two token kinds under `mode`; returns their hashes
    /// as a pair after asserting neither was dropped.
    fn hash_pair(a: TokenKind, b: TokenKind, mode: DetectionMode) -> (u64, u64) {
        let hashed = normalize_and_hash(&[token(a), token(b)], mode);
        assert_eq!(hashed.len(), 2);
        (hashed[0].hash, hashed[1].hash)
    }

    #[test]
    fn strict_mode_preserves_identifiers() {
        let (foo, bar) = hash_pair(
            TokenKind::Identifier("foo".to_string()),
            TokenKind::Identifier("bar".to_string()),
            DetectionMode::Strict,
        );
        assert_ne!(foo, bar);
    }

    #[test]
    fn semantic_mode_blinds_identifiers() {
        let (foo, bar) = hash_pair(
            TokenKind::Identifier("foo".to_string()),
            TokenKind::Identifier("bar".to_string()),
            DetectionMode::Semantic,
        );
        assert_eq!(foo, bar);
    }

    #[test]
    fn semantic_mode_blinds_string_literals() {
        let (hello, world) = hash_pair(
            TokenKind::StringLiteral("hello".to_string()),
            TokenKind::StringLiteral("world".to_string()),
            DetectionMode::Semantic,
        );
        assert_eq!(hello, world);
    }

    #[test]
    fn semantic_mode_blinds_numeric_literals() {
        let (forty_two, ninety_nine) = hash_pair(
            TokenKind::NumericLiteral("42".to_string()),
            TokenKind::NumericLiteral("99".to_string()),
            DetectionMode::Semantic,
        );
        assert_eq!(forty_two, ninety_nine);
    }

    #[test]
    fn semantic_mode_preserves_booleans() {
        let (yes, no) = hash_pair(
            TokenKind::BooleanLiteral(true),
            TokenKind::BooleanLiteral(false),
            DetectionMode::Semantic,
        );
        assert_ne!(yes, no);
    }

    #[test]
    fn semantic_mode_preserves_keywords() {
        let (if_kw, while_kw) = hash_pair(
            TokenKind::Keyword(KeywordType::If),
            TokenKind::Keyword(KeywordType::While),
            DetectionMode::Semantic,
        );
        assert_ne!(if_kw, while_kw);
    }

    #[test]
    fn preserves_original_indices() {
        let tokens = [
            token(TokenKind::Keyword(KeywordType::Const)),
            token(TokenKind::Identifier("x".to_string())),
            token(TokenKind::Operator(OperatorType::Assign)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert_eq!(hashed.len(), 3);
        // Every surviving entry must point back at its source position.
        for (position, entry) in hashed.iter().enumerate() {
            assert_eq!(entry.original_index, position);
        }
    }

    #[test]
    fn empty_input_produces_empty_output() {
        let hashed = normalize_and_hash(&[], DetectionMode::Mild);
        assert!(hashed.is_empty());
    }

    #[test]
    fn operators_have_distinct_hashes() {
        let (add, sub) = hash_pair(
            TokenKind::Operator(OperatorType::Add),
            TokenKind::Operator(OperatorType::Sub),
            DetectionMode::Strict,
        );
        assert_ne!(add, sub);
    }

    #[test]
    fn punctuation_has_distinct_hashes() {
        let (open, close) = hash_pair(
            TokenKind::Punctuation(PunctuationType::OpenParen),
            TokenKind::Punctuation(PunctuationType::CloseParen),
            DetectionMode::Strict,
        );
        assert_ne!(open, close);
    }
}