use xxhash_rust::xxh3::xxh3_64;

use super::tokenize::{SourceToken, TokenKind};
use fallow_config::{DetectionMode, NormalizationConfig, ResolvedNormalization};
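
/// A token collapsed to a single 64-bit fingerprint. `original_index` records
/// the token's position in the input slice, so a matching hash can be traced
/// back to the corresponding `SourceToken` and its span.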
#[derive(Debug, Clone)]
pub struct HashedToken {
    pub hash: u64,
    pub original_index: usize,
}
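
/// Normalizes and hashes `tokens` using the defaults implied by `mode`, via
/// `ResolvedNormalization::resolve` with a default `NormalizationConfig`.
///
/// A minimal usage sketch (illustrative only, not compiled as a doctest; the
/// `tokens` value is assumed to hold two distinct identifier tokens):
///
/// ```ignore
/// // Under Strict, distinct identifiers keep distinct hashes; under
/// // Semantic, identifiers are blinded and hash equal.
/// let strict = normalize_and_hash(&tokens, DetectionMode::Strict);
/// let semantic = normalize_and_hash(&tokens, DetectionMode::Semantic);
/// ```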
pub fn normalize_and_hash(tokens: &[SourceToken], mode: DetectionMode) -> Vec<HashedToken> {
    let resolved = ResolvedNormalization::resolve(mode, &NormalizationConfig::default());
    normalize_and_hash_resolved(tokens, &resolved)
}
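
/// Like [`normalize_and_hash`], but takes an already-resolved normalization so
/// callers can override the per-mode defaults. The output is index-aligned
/// with the input: `result[i].original_index == i`.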
pub fn normalize_and_hash_resolved(
    tokens: &[SourceToken],
    normalization: &ResolvedNormalization,
) -> Vec<HashedToken> {
    let mut result = Vec::with_capacity(tokens.len());

    for (i, token) in tokens.iter().enumerate() {
        let hash = hash_token_resolved(&token.kind, *normalization);
        result.push(HashedToken {
            hash,
            original_index: i,
        });
    }

    result
}
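
/// Hashes one token kind. Each variant is prefixed with a distinct tag byte
/// (0 = keyword, 1 = identifier, 2 = string, 3 = number, 4 = boolean,
/// 5 = null, 6 = template, 7 = regexp, 8 = operator, 9 = punctuation), so
/// values from different categories can never collide. When `norm` blinds a
/// category, every value in it hashes to the same `[tag, 0]` sentinel.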
fn hash_token_resolved(kind: &TokenKind, norm: ResolvedNormalization) -> u64 {
    match kind {
        TokenKind::Keyword(kw) => hash_bytes(&[0, *kw as u8]),
        TokenKind::Identifier(name) => {
            if norm.ignore_identifiers {
                hash_bytes(&[1, 0])
            } else {
                let mut buf = vec![1];
                buf.extend_from_slice(name.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::StringLiteral(val) => {
            if norm.ignore_string_values {
                hash_bytes(&[2, 0])
            } else {
                let mut buf = vec![2];
                buf.extend_from_slice(val.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::NumericLiteral(val) => {
            if norm.ignore_numeric_values {
                hash_bytes(&[3, 0])
            } else {
                let mut buf = vec![3];
                buf.extend_from_slice(val.as_bytes());
                hash_bytes(&buf)
            }
        }
        TokenKind::BooleanLiteral(val) => hash_bytes(&[4, u8::from(*val)]),
        TokenKind::NullLiteral => hash_bytes(&[5]),
        TokenKind::TemplateLiteral => hash_bytes(&[6]),
        TokenKind::RegExpLiteral => hash_bytes(&[7]),
        TokenKind::Operator(op) => hash_bytes(&[8, *op as u8]),
        TokenKind::Punctuation(p) => hash_bytes(&[9, *p as u8]),
    }
}
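
/// XXH3 is a fast, non-cryptographic hash; 64 bits is ample for fingerprinting
/// tokens, where adversarial collisions are not a concern.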
fn hash_bytes(data: &[u8]) -> u64 {
    xxh3_64(data)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
    use oxc_span::Span;

    fn make_token(kind: TokenKind) -> SourceToken {
        SourceToken {
            kind,
            span: Span::new(0, 0),
        }
    }

    #[test]
    fn strict_mode_preserves_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_identifiers() {
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_string_literals() {
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_blinds_numeric_literals() {
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_booleans() {
        let tokens = vec![
            make_token(TokenKind::BooleanLiteral(true)),
            make_token(TokenKind::BooleanLiteral(false)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn semantic_mode_preserves_keywords() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::If)),
            make_token(TokenKind::Keyword(KeywordType::While)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Semantic);
        assert_eq!(hashed.len(), 2);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn preserves_original_indices() {
        let tokens = vec![
            make_token(TokenKind::Keyword(KeywordType::Const)),
            make_token(TokenKind::Identifier("x".to_string())),
            make_token(TokenKind::Operator(OperatorType::Assign)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert_eq!(hashed.len(), 3);
        assert_eq!(hashed[0].original_index, 0);
        assert_eq!(hashed[1].original_index, 1);
        assert_eq!(hashed[2].original_index, 2);
    }

    #[test]
    fn empty_input_produces_empty_output() {
        let tokens: Vec<SourceToken> = vec![];
        let hashed = normalize_and_hash(&tokens, DetectionMode::Mild);
        assert!(hashed.is_empty());
    }

    #[test]
    fn operators_have_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Operator(OperatorType::Add)),
            make_token(TokenKind::Operator(OperatorType::Sub)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn punctuation_has_distinct_hashes() {
        let tokens = vec![
            make_token(TokenKind::Punctuation(PunctuationType::OpenParen)),
            make_token(TokenKind::Punctuation(PunctuationType::CloseParen)),
        ];

        let hashed = normalize_and_hash(&tokens, DetectionMode::Strict);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }
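
    // The `*_override` tests below bypass mode resolution and drive
    // `normalize_and_hash_resolved` directly with hand-built
    // `ResolvedNormalization` values, flipping one flag at a time.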
    #[test]
    fn resolved_strict_with_ignore_identifiers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: true,
            ignore_string_values: false,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, &norm);
        assert_eq!(hashed.len(), 2);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_strings_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: false,
        };
        let tokens = vec![
            make_token(TokenKind::StringLiteral("hello".to_string())),
            make_token(TokenKind::StringLiteral("world".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, &norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_strict_with_ignore_numbers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: false,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::NumericLiteral("42".to_string())),
            make_token(TokenKind::NumericLiteral("99".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, &norm);
        assert_eq!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_semantic_with_preserve_identifiers_override() {
        let norm = ResolvedNormalization {
            ignore_identifiers: false,
            ignore_string_values: true,
            ignore_numeric_values: true,
        };
        let tokens = vec![
            make_token(TokenKind::Identifier("foo".to_string())),
            make_token(TokenKind::Identifier("bar".to_string())),
        ];

        let hashed = normalize_and_hash_resolved(&tokens, &norm);
        assert_ne!(hashed[0].hash, hashed[1].hash);
    }

    #[test]
    fn resolved_normalization_from_mode_defaults() {
        use fallow_config::NormalizationConfig;

        let norm =
            ResolvedNormalization::resolve(DetectionMode::Strict, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(!norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        let norm =
            ResolvedNormalization::resolve(DetectionMode::Weak, &NormalizationConfig::default());
        assert!(!norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);

        let norm = ResolvedNormalization::resolve(
            DetectionMode::Semantic,
            &NormalizationConfig::default(),
        );
        assert!(norm.ignore_identifiers);
        assert!(norm.ignore_string_values);
        assert!(norm.ignore_numeric_values);
    }

    #[test]
    fn resolved_normalization_overrides_mode_defaults() {
        use fallow_config::NormalizationConfig;

        let overrides = NormalizationConfig {
            ignore_identifiers: Some(true),
            ignore_string_values: None,
            ignore_numeric_values: None,
        };
        let norm = ResolvedNormalization::resolve(DetectionMode::Strict, &overrides);
        assert!(norm.ignore_identifiers);
        assert!(!norm.ignore_string_values);
        assert!(!norm.ignore_numeric_values);
    }
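
    // Property-based tests complement the examples above: random token streams
    // and normalization settings check invariants (idempotence, determinism,
    // length preservation, sequential indices) rather than hand-picked cases.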
    mod proptests {
        use super::*;
        use crate::duplicates::tokenize::{KeywordType, OperatorType, PunctuationType};
        use oxc_span::Span;
        use proptest::prelude::*;

        fn make_token(kind: TokenKind) -> SourceToken {
            SourceToken {
                kind,
                span: Span::new(0, 0),
            }
        }

        fn arb_detection_mode() -> impl Strategy<Value = DetectionMode> {
            prop::sample::select(vec![
                DetectionMode::Strict,
                DetectionMode::Mild,
                DetectionMode::Weak,
                DetectionMode::Semantic,
            ])
        }

        fn arb_normalization() -> impl Strategy<Value = ResolvedNormalization> {
            (any::<bool>(), any::<bool>(), any::<bool>()).prop_map(|(ids, strings, nums)| {
                ResolvedNormalization {
                    ignore_identifiers: ids,
                    ignore_string_values: strings,
                    ignore_numeric_values: nums,
                }
            })
        }

        fn arb_token_kind() -> impl Strategy<Value = TokenKind> {
            prop_oneof![
                Just(TokenKind::Keyword(KeywordType::Const)),
                Just(TokenKind::Keyword(KeywordType::If)),
                Just(TokenKind::Keyword(KeywordType::Return)),
                "[a-zA-Z_][a-zA-Z0-9_]{0,30}".prop_map(TokenKind::Identifier),
                "[a-zA-Z0-9 _.,!?]{0,50}".prop_map(TokenKind::StringLiteral),
                "[0-9]{1,10}(\\.[0-9]{1,5})?".prop_map(TokenKind::NumericLiteral),
                any::<bool>().prop_map(TokenKind::BooleanLiteral),
                Just(TokenKind::NullLiteral),
                Just(TokenKind::TemplateLiteral),
                Just(TokenKind::RegExpLiteral),
                Just(TokenKind::Operator(OperatorType::Add)),
                Just(TokenKind::Operator(OperatorType::Assign)),
                Just(TokenKind::Punctuation(PunctuationType::OpenParen)),
                Just(TokenKind::Punctuation(PunctuationType::CloseParen)),
            ]
        }

        proptest! {
            #[test]
            fn normalization_is_idempotent(
                kind in arb_token_kind(),
                norm in arb_normalization(),
            ) {
                let token = make_token(kind);
                let first = normalize_and_hash_resolved(&[token.clone()], &norm);
                let second = normalize_and_hash_resolved(&[token], &norm);
                prop_assert_eq!(first.len(), second.len());
                for (a, b) in first.iter().zip(second.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Normalization should be idempotent");
                }
            }

            #[test]
            fn normalization_is_deterministic(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result1 = normalize_and_hash(&tokens, mode);
                let result2 = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(result1.len(), result2.len());
                for (a, b) in result1.iter().zip(result2.iter()) {
                    prop_assert_eq!(a.hash, b.hash, "Same input must produce same hash");
                    prop_assert_eq!(a.original_index, b.original_index);
                }
            }

            #[test]
            fn output_length_matches_input(
                kinds in prop::collection::vec(arb_token_kind(), 0..30),
                mode in arb_detection_mode(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash(&tokens, mode);
                prop_assert_eq!(
                    result.len(), tokens.len(),
                    "Output should have same length as input"
                );
            }

            #[test]
            fn original_indices_are_sequential(
                kinds in prop::collection::vec(arb_token_kind(), 1..20),
                norm in arb_normalization(),
            ) {
                let tokens: Vec<SourceToken> = kinds.into_iter().map(make_token).collect();
                let result = normalize_and_hash_resolved(&tokens, &norm);
                for (i, hashed) in result.iter().enumerate() {
                    prop_assert_eq!(hashed.original_index, i);
                }
            }
        }
    }
}