use std::collections::VecDeque;
use std::num::Wrapping;

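/// A normalized source token. Identifiers and literals are collapsed to
/// placeholder variants so that renaming a variable or changing a literal
/// value does not defeat clone detection; keywords, operators, and
/// punctuation keep their text.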
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    /// A language keyword, kept verbatim (e.g. `fn`, `def`, `function`).
    Keyword(String),
    /// Any identifier, collapsed to a single placeholder.
    Identifier,
    /// Any string literal, collapsed to a single placeholder.
    StringLiteral,
    /// Any numeric literal, collapsed to a single placeholder.
    NumberLiteral,
    /// An operator, kept verbatim (e.g. `+`, `==`, `->`).
    Operator(String),
    /// A punctuation character, kept verbatim (e.g. `{`, `;`).
    Punctuation(String),
}

impl Token {
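    /// Returns the string that represents this token for hashing: the
    /// literal text for keywords, operators, and punctuation, and a
    /// `$$`-prefixed placeholder for the collapsed token classes.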
    pub fn as_hash_string(&self) -> &str {
        match self {
            Token::Keyword(kw) => kw.as_str(),
            Token::Identifier => "$$ID",
            Token::StringLiteral => "$$STR",
            Token::NumberLiteral => "$$NUM",
            Token::Operator(op) => op.as_str(),
            Token::Punctuation(p) => p.as_str(),
        }
    }
}

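/// Tokenizes `code` into a normalized token stream, skipping whitespace
/// and comments (`//`, `#`, and `/* ... */`). This is a language-agnostic
/// heuristic aimed at Rust, Python, and JavaScript rather than a faithful
/// lexer for any one of them (for example, `#` also starts Rust
/// attributes, which are swallowed as comments here).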
pub fn normalize(code: &str) -> Vec<Token> {
    let keywords = get_keyword_set();
    // Rough heuristic: one token per ~3 bytes of source.
    let estimated_tokens = code.len() / 3;
    let mut tokens = Vec::with_capacity(estimated_tokens);
    let chars: Vec<char> = code.chars().collect();
    let mut i = 0;

    while i < chars.len() {
        let ch = chars[i];

        // Whitespace carries no signal; skip it.
        if ch.is_whitespace() {
            i += 1;
            continue;
        }

        // Line comments: `//` (Rust, JS) or `#` (Python).
        if (ch == '/' && i + 1 < chars.len() && chars[i + 1] == '/') || ch == '#' {
            while i < chars.len() && chars[i] != '\n' {
                i += 1;
            }
            continue;
        }

        // Block comments: `/* ... */`.
        if ch == '/' && i + 1 < chars.len() && chars[i + 1] == '*' {
            i += 2;
            while i + 1 < chars.len() {
                if chars[i] == '*' && chars[i + 1] == '/' {
                    i += 2;
                    break;
                }
                i += 1;
            }
            continue;
        }

        // String literals collapse to a single placeholder token.
        if ch == '"' || ch == '\'' {
            let quote = ch;
            i += 1;
            while i < chars.len() {
                // Skip escaped characters, including escaped quotes.
                if chars[i] == '\\' {
                    i += 2;
                    continue;
                }
                if chars[i] == quote {
                    i += 1;
                    break;
                }
                i += 1;
            }
            tokens.push(Token::StringLiteral);
            continue;
        }

        // Numeric literals (including forms like `1.5` or `0x1F`)
        // collapse to a single placeholder token.
        if ch.is_ascii_digit() {
            while i < chars.len() && (chars[i].is_ascii_alphanumeric() || chars[i] == '.') {
                i += 1;
            }
            tokens.push(Token::NumberLiteral);
            continue;
        }

        // Words: keywords keep their text; everything else collapses to
        // the identifier placeholder.
        if ch.is_alphabetic() || ch == '_' || ch == '$' {
            let start = i;
            while i < chars.len()
                && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '$')
            {
                i += 1;
            }
            let word: String = chars[start..i].iter().collect();

            if keywords.contains(&word.as_str()) {
                tokens.push(Token::Keyword(word));
            } else {
                tokens.push(Token::Identifier);
            }
            continue;
        }

        // Two-character operators take precedence over single-character
        // ones (`==` before `=`, `->` before `-`).
        if i + 1 < chars.len() {
            let two_char: String = chars[i..i + 2].iter().collect();
            if is_operator(&two_char) {
                tokens.push(Token::Operator(two_char));
                i += 2;
                continue;
            }
        }

        let single = ch.to_string();
        if is_operator(&single) {
            tokens.push(Token::Operator(single));
        } else if is_punctuation(ch) {
            tokens.push(Token::Punctuation(single));
        }
        // Anything unrecognized is dropped.

        i += 1;
    }

    tokens
}

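/// A Rabin-Karp style polynomial rolling hash over a fixed-size window of
/// token hashes, using wrapping (mod 2^64) arithmetic.
///
/// For a full window `t_0, ..., t_{k-1}` the hash is
/// `t_0 * B^(k-1) + t_1 * B^(k-2) + ... + t_{k-1}`, so sliding the window
/// means subtracting `t_0 * B^(k-1)`, multiplying by `B`, and adding the
/// incoming token, all in O(1).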
#[derive(Debug, Clone)]
pub struct RollingHash {
    /// Number of token hashes in one window.
    window_size: usize,
    /// Multiplier for the polynomial hash.
    base: Wrapping<u64>,
    /// Hash of the tokens currently in the window.
    hash: Wrapping<u64>,
    /// `base^(window_size - 1)`, the coefficient of the oldest token in
    /// a full window; used to remove that token when rolling.
    base_power: Wrapping<u64>,
    /// The token hashes currently in the window, oldest first.
    window: VecDeque<u64>,
}

impl RollingHash {
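    /// Creates a hasher for windows of `window_size` tokens, precomputing
    /// `base^(window_size - 1)` for the roll step.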
    pub fn new(window_size: usize) -> Self {
        let base = Wrapping(257u64);
        let mut base_power = Wrapping(1u64);

        // The oldest token in a full window carries the coefficient
        // base^(window_size - 1), so that is the power needed to roll it
        // out (base^window_size would remove too much).
        for _ in 1..window_size {
            base_power *= base;
        }

        Self {
            window_size,
            base,
            hash: Wrapping(0),
            base_power,
            window: VecDeque::with_capacity(window_size),
        }
    }

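    /// Feeds one token hash into the window. Returns the window hash once
    /// the window is full (and on every subsequent roll), or `None` while
    /// it is still filling.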
    pub fn roll(&mut self, token_hash: u64) -> Option<u64> {
        if self.window.len() < self.window_size {
            // Still filling the window: just shift the new token in.
            self.window.push_back(token_hash);
            self.hash = self.hash * self.base + Wrapping(token_hash);

            if self.window.len() == self.window_size {
                Some(self.hash.0)
            } else {
                None
            }
        } else {
            // Full window: remove the oldest token's contribution
            // (coefficient base^(window_size - 1)), then shift the new
            // token in. Wrapping arithmetic keeps this exact mod 2^64.
            let old_token = self.window.pop_front().expect("window is full");
            self.window.push_back(token_hash);

            self.hash -= Wrapping(old_token) * self.base_power;
            self.hash = self.hash * self.base + Wrapping(token_hash);

            Some(self.hash.0)
        }
    }

    /// Clears the window and hash so the hasher can be reused.
    pub fn reset(&mut self) {
        self.hash = Wrapping(0);
        self.window.clear();
    }

    /// The hash of the current window, or `None` if it is not yet full.
    pub fn current_hash(&self) -> Option<u64> {
        if self.window.len() == self.window_size {
            Some(self.hash.0)
        } else {
            None
        }
    }

    pub fn window_size(&self) -> usize {
        self.window_size
    }
}

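/// Computes the rolling hash of every `window_size`-sized window over
/// `tokens`, returning `(hash, start_index)` pairs where `start_index` is
/// the token index at which the window begins.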
pub fn compute_rolling_hashes(tokens: &[Token], window_size: usize) -> Vec<(u64, usize)> {
    if window_size == 0 || tokens.len() < window_size {
        return Vec::new();
    }

    let mut hasher = RollingHash::new(window_size);
    let mut hashes = Vec::new();

    for (idx, token) in tokens.iter().enumerate() {
        let token_hash = hash_token(token);
        if let Some(hash) = hasher.roll(token_hash) {
            // `idx` is the last token of the window, so the window
            // starts `window_size - 1` tokens earlier.
            let start_index = idx.saturating_sub(window_size - 1);
            hashes.push((hash, start_index));
        }
    }

    hashes
}

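/// Hashes a single token via its normalized string form. `DefaultHasher`
/// is deterministic within a single process run, which is all the rolling
/// hash needs; hash values are never persisted.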
fn hash_token(token: &Token) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    token.as_hash_string().hash(&mut hasher);
    hasher.finish()
}

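/// A detected clone: two equal-content token ranges, identified by their
/// start indices and common length.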
#[derive(Debug, Clone)]
pub struct CloneMatch {
    /// Token index where the earlier (source) copy starts.
    pub source_start: usize,
    /// Token index where the later (target) copy starts.
    pub target_start: usize,
    /// Length of the match, in tokens.
    pub length: usize,
}

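/// Detects duplicated token windows and greedily extends each verified
/// match beyond `window_size`. Candidates are found by hashing every
/// window, then confirmed token-by-token before being extended, so hash
/// collisions cannot produce false positives.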
pub fn detect_duplicates_with_extension(tokens: &[Token], window_size: usize) -> Vec<CloneMatch> {
    use std::collections::HashMap;

    if window_size == 0 || tokens.len() < window_size {
        return Vec::new();
    }

    let mut matches = Vec::new();
    let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
    let mut i = 0;

    while i <= tokens.len().saturating_sub(window_size) {
        let current_hash = compute_window_hash(&tokens[i..i + window_size]);

        // Record this window before scanning for matches, so the stored
        // index always refers to the position the hash was computed at,
        // even when `i` is advanced past an extended match below. The
        // `prev_index >= i` guard skips the entry we just inserted.
        hash_map.entry(current_hash).or_default().push(i);

        if let Some(prev_indices) = hash_map.get(&current_hash) {
            for &prev_index in prev_indices.iter() {
                if prev_index >= i {
                    continue;
                }

                // Hashes can collide; confirm token-by-token equality.
                if verify_window_match(tokens, prev_index, i, window_size) {
                    // Greedily extend the match beyond the window, without
                    // letting the source region overlap the target start.
                    let mut extension = 0;
                    while (i + window_size + extension < tokens.len())
                        && (prev_index + window_size + extension < i)
                        && (tokens[prev_index + window_size + extension]
                            == tokens[i + window_size + extension])
                    {
                        extension += 1;
                    }

                    let total_length = window_size + extension;

                    matches.push(CloneMatch {
                        source_start: prev_index,
                        target_start: i,
                        length: total_length,
                    });

                    // Skip ahead so the extended region is not reported
                    // again as a shorter overlapping clone.
                    i += extension.max(1);
                    break;
                }
            }
        }

        i += 1;
    }

    matches
}

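/// Hashes one window from scratch using a polynomial hash mod 1e9+7.
/// Note this is independent of `RollingHash` (which reduces mod 2^64);
/// the two hash families are never compared with each other.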
fn compute_window_hash(window: &[Token]) -> u64 {
    // Polynomial hash base; a small prime larger than any byte value.
    const BASE: u64 = 257;
    // Large prime modulus keeps the hash well distributed.
    const MODULUS: u64 = 1_000_000_007;

    let mut hash: u64 = 0;
    for token in window {
        let token_hash = hash_token(token);
        // Widen to u128 so the multiply cannot overflow before the
        // reduction.
        let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
        hash = wide_hash as u64;
    }
    hash
}

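/// Confirms a candidate hash match by comparing the two token ranges
/// directly, so a hash collision cannot yield a false clone. Returns
/// `false` if either range would run past the end of `tokens`.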
fn verify_window_match(tokens: &[Token], idx_a: usize, idx_b: usize, len: usize) -> bool {
    if idx_a + len > tokens.len() || idx_b + len > tokens.len() {
        return false;
    }
    tokens[idx_a..idx_a + len] == tokens[idx_b..idx_b + len]
}

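/// The combined keyword set for the languages the tokenizer targets
/// (Rust, Python, JavaScript). Keywords shared between languages appear
/// once per language block; the duplicates are harmless for a slice
/// membership test.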
fn get_keyword_set() -> &'static [&'static str] {
    &[
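        // Rust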
419 "as",
421 "break",
422 "const",
423 "continue",
424 "crate",
425 "else",
426 "enum",
427 "extern",
428 "false",
429 "fn",
430 "for",
431 "if",
432 "impl",
433 "in",
434 "let",
435 "loop",
436 "match",
437 "mod",
438 "move",
439 "mut",
440 "pub",
441 "ref",
442 "return",
443 "self",
444 "Self",
445 "static",
446 "struct",
447 "super",
448 "trait",
449 "true",
450 "type",
451 "unsafe",
452 "use",
453 "where",
454 "while",
455 "async",
456 "await",
457 "dyn",
458 "and",
460 "assert",
461 "class",
462 "def",
463 "del",
464 "elif",
465 "except",
466 "finally",
467 "from",
468 "global",
469 "import",
470 "is",
471 "lambda",
472 "nonlocal",
473 "not",
474 "or",
475 "pass",
476 "raise",
477 "try",
478 "with",
479 "yield",
480 "await",
482 "case",
483 "catch",
484 "class",
485 "const",
486 "continue",
487 "debugger",
488 "default",
489 "delete",
490 "do",
491 "else",
492 "export",
493 "extends",
494 "finally",
495 "for",
496 "function",
497 "if",
498 "import",
499 "in",
500 "instanceof",
501 "let",
502 "new",
503 "return",
504 "super",
505 "switch",
506 "this",
507 "throw",
508 "try",
509 "typeof",
510 "var",
511 "void",
512 "while",
513 "with",
514 "yield",
515 ]
516}
517
fn is_operator(s: &str) -> bool {
    matches!(
        s,
        "+" | "-"
            | "*"
            | "/"
            | "%"
            | "="
            | "=="
            | "!="
            | "<"
            | ">"
            | "<="
            | ">="
            | "&&"
            | "||"
            | "!"
            | "&"
            | "|"
            | "^"
            | "<<"
            | ">>"
            | "+="
            | "-="
            | "*="
            | "/="
            | "=>"
            | "->"
            | "::"
            | "."
    )
}

fn is_punctuation(ch: char) -> bool {
    matches!(
        ch,
        '(' | ')' | '{' | '}' | '[' | ']' | ';' | ':' | ',' | '?'
    )
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_rust_code() {
        let code = r#"
            fn add(x: i32, y: i32) -> i32 {
                x + y
            }
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "fn")));

        assert!(tokens.contains(&Token::Identifier));
    }

    #[test]
    fn test_normalize_python_code() {
        let code = r#"
            def greet(name):
                return f"Hello, {name}!"
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "def")));
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));

        assert!(tokens.contains(&Token::StringLiteral));
    }

    #[test]
    fn test_normalize_javascript_code() {
        let code = r#"
            function multiply(a, b) {
                return a * b;
            }
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "function")));
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
    }

    #[test]
    fn test_normalize_ignores_comments() {
        let code = r#"
            // This is a comment
            fn test() {
                /* Multi-line
                   comment */
                let x = 5; // inline comment
            }
        "#;

        let tokens = normalize(code);

        // If comments were not skipped, their words would appear as
        // extra Identifier tokens. The body tokenizes to exactly:
        // fn ID ( ) { let ID = NUM ; }
        assert_eq!(tokens.len(), 11);
        assert!(tokens.iter().any(|t| matches!(t, Token::Keyword(k) if k == "let")));
    }

    #[test]
    fn test_rolling_hash_creation() {
        let hasher = RollingHash::new(50);
        assert_eq!(hasher.window_size(), 50);
        assert_eq!(hasher.current_hash(), None);
    }

    #[test]
    fn test_rolling_hash_basic() {
        let mut hasher = RollingHash::new(3);

        assert_eq!(hasher.roll(1), None);
        assert_eq!(hasher.roll(2), None);
        let hash1 = hasher.roll(3);
        assert!(hash1.is_some());

        let hash2 = hasher.roll(4);
        assert!(hash2.is_some());

        assert_ne!(hash1, hash2);
    }
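
    // A sketch of the rolling invariant this hasher relies on: the hash
    // reached incrementally by rolling must equal the hash of the same
    // window computed from scratch. This is exactly what the
    // `base_power = base^(window_size - 1)` term guarantees.
    #[test]
    fn test_rolling_hash_matches_fresh_hash() {
        let mut rolled = RollingHash::new(3);
        rolled.roll(1);
        rolled.roll(2);
        rolled.roll(3);
        let rolled_hash = rolled.roll(4).unwrap(); // window is now [2, 3, 4]

        let mut fresh = RollingHash::new(3);
        fresh.roll(2);
        fresh.roll(3);
        let fresh_hash = fresh.roll(4).unwrap();

        assert_eq!(rolled_hash, fresh_hash);
    }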
    #[test]
    fn test_compute_rolling_hashes() {
        let tokens = vec![
            Token::Keyword("fn".to_string()),
            Token::Identifier,
            Token::Punctuation("(".to_string()),
            Token::Identifier,
            Token::Punctuation(")".to_string()),
        ];

        // Five tokens with a window of three yield windows starting at
        // indices 0, 1, and 2.
        let hashes = compute_rolling_hashes(&tokens, 3);
        assert_eq!(hashes.len(), 3);
    }

    #[test]
    fn test_hash_token_consistency() {
        let token1 = Token::Identifier;
        let token2 = Token::Identifier;

        assert_eq!(hash_token(&token1), hash_token(&token2));
    }

    #[test]
    fn test_token_as_hash_string() {
        assert_eq!(Token::Identifier.as_hash_string(), "$$ID");
        assert_eq!(Token::StringLiteral.as_hash_string(), "$$STR");
        assert_eq!(Token::NumberLiteral.as_hash_string(), "$$NUM");
        assert_eq!(Token::Keyword("fn".to_string()).as_hash_string(), "fn");
    }
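
    // An end-to-end sketch added for coverage: two functions whose token
    // streams are identical after normalization (names differ, structure
    // matches) should yield at least one verified, extended CloneMatch.
    #[test]
    fn test_detect_duplicates_with_extension() {
        let code = r#"
            fn alpha(x: i32) -> i32 { x + 1 }
            fn beta(y: i32) -> i32 { y + 1 }
        "#;

        let tokens = normalize(code);
        let matches = detect_duplicates_with_extension(&tokens, 5);
        assert!(!matches.is_empty());

        for m in &matches {
            // The source copy always precedes the target copy, and both
            // ranges must compare equal token-for-token.
            assert!(m.source_start < m.target_start);
            assert_eq!(
                tokens[m.source_start..m.source_start + m.length],
                tokens[m.target_start..m.target_start + m.length]
            );
        }
    }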
}