use std::collections::VecDeque;
use std::num::Wrapping;

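/// A normalized source token. Identifiers and literals collapse to fixed
/// placeholder variants so renamed copies (Type-2 clones) compare equal;
/// keywords, operators, and punctuation keep their text.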
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    Keyword(String),
    Identifier,
    StringLiteral,
    NumberLiteral,
    Operator(String),
    Punctuation(String),
}

impl Token {
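    /// Returns the canonical string this token contributes to a hash.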
    pub fn as_hash_string(&self) -> &str {
        match self {
            Token::Keyword(kw) => kw.as_str(),
            Token::Identifier => "$$ID",
            Token::StringLiteral => "$$STR",
            Token::NumberLiteral => "$$NUM",
            Token::Operator(op) => op.as_str(),
            Token::Punctuation(p) => p.as_str(),
        }
    }
}

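/// Tokenizes `code` into a normalized token stream, skipping whitespace and
/// comments. Works for C-like and Python-like syntax: `//`, `/* */`, and `#`
/// comments are all stripped (note that `#` also swallows Rust attributes,
/// which is acceptable for clone detection).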
pub fn normalize(code: &str) -> Vec<Token> {
    let keywords = get_keyword_set();
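    // Heuristic preallocation: roughly one token per three bytes of source.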
    let estimated_tokens = code.len() / 3;
    let mut tokens = Vec::with_capacity(estimated_tokens);
    let chars: Vec<char> = code.chars().collect();
    let mut i = 0;

    while i < chars.len() {
        let ch = chars[i];

        if ch.is_whitespace() {
            i += 1;
            continue;
        }

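        // Line comments: C-style `//` and Python/shell-style `#`.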
        if (ch == '/' && i + 1 < chars.len() && chars[i + 1] == '/') || ch == '#' {
            while i < chars.len() && chars[i] != '\n' {
                i += 1;
            }
            continue;
        }

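        // Block comments: `/* ... */`; an unterminated comment runs to EOF.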
        if ch == '/' && i + 1 < chars.len() && chars[i + 1] == '*' {
            i += 2;
            while i + 1 < chars.len() {
                if chars[i] == '*' && chars[i + 1] == '/' {
                    i += 2;
                    break;
                }
                i += 1;
            }
            continue;
        }

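        // String literals (single- or double-quoted) collapse to a placeholder.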
99 if ch == '"' || ch == '\'' {
101 let quote = ch;
102 i += 1;
103 while i < chars.len() {
105 if chars[i] == '\\' {
106 i += 2; continue;
108 }
109 if chars[i] == quote {
110 i += 1;
111 break;
112 }
113 i += 1;
114 }
115 tokens.push(Token::StringLiteral);
116 continue;
117 }
118
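        // Number literals; consumes alphanumerics and dots, which also covers
        // hex literals and simple floats.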
        if ch.is_ascii_digit() {
            while i < chars.len() && (chars[i].is_ascii_alphanumeric() || chars[i] == '.') {
                i += 1;
            }
            tokens.push(Token::NumberLiteral);
            continue;
        }

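        // Identifiers and keywords.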
        if ch.is_alphabetic() || ch == '_' || ch == '$' {
            let start = i;
            while i < chars.len()
                && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '$')
            {
                i += 1;
            }
            let word: String = chars[start..i].iter().collect();

            if keywords.contains(&word.as_str()) {
                tokens.push(Token::Keyword(word));
            } else {
                tokens.push(Token::Identifier);
            }
            continue;
        }

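        // Try two-character operators before single-character ones.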
        if i + 1 < chars.len() {
            let two_char: String = chars[i..i + 2].iter().collect();
            if is_operator(&two_char) {
                tokens.push(Token::Operator(two_char));
                i += 2;
                continue;
            }
        }

        let single = ch.to_string();
        if is_operator(&single) {
            tokens.push(Token::Operator(single));
        } else if is_punctuation(ch) {
            tokens.push(Token::Punctuation(single));
        }

        i += 1;
    }

    tokens
}

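/// Polynomial rolling hash (Rabin-Karp style) over a fixed-size window of
/// token hashes, using wrapping u64 arithmetic (i.e. modulo 2^64).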
#[derive(Debug, Clone)]
pub struct RollingHash {
    window_size: usize,
    base: Wrapping<u64>,
    hash: Wrapping<u64>,
    // base^window_size: the weight of the outgoing token after the multiply step.
    base_power: Wrapping<u64>,
    window: VecDeque<u64>,
}

impl RollingHash {
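    /// Creates a hasher for windows of `window_size` tokens, precomputing
    /// `base^window_size` so the oldest token can be removed in O(1).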
    pub fn new(window_size: usize) -> Self {
        let base = Wrapping(257u64);
        let mut base_power = Wrapping(1u64);

        for _ in 0..window_size {
            base_power *= base;
        }

        Self {
            window_size,
            base,
            hash: Wrapping(0),
            base_power,
            window: VecDeque::with_capacity(window_size),
        }
    }

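    /// Pushes one token hash into the window. Returns the window hash once
    /// the window is full, or `None` while it is still filling.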
    pub fn roll(&mut self, token_hash: u64) -> Option<u64> {
        if self.window.len() < self.window_size {
            self.window.push_back(token_hash);
            self.hash = self.hash * self.base + Wrapping(token_hash);

            if self.window.len() == self.window_size {
                Some(self.hash.0)
            } else {
                None
            }
        } else {
            let old_token = self.window.pop_front().expect("window is full");
            self.window.push_back(token_hash);

            // h' = h * base + new - old * base^window_size. Multiply first:
            // subtracting `old * base_power` before the multiply would remove
            // the outgoing token with weight base^(window_size + 1).
            self.hash = self.hash * self.base + Wrapping(token_hash);
            self.hash -= Wrapping(old_token) * self.base_power;

            Some(self.hash.0)
        }
    }

    /// Clears the accumulated hash and window so the hasher can be reused.
    pub fn reset(&mut self) {
        self.hash = Wrapping(0);
        self.window.clear();
    }

    /// The hash of the current window, or `None` if the window is not yet full.
    pub fn current_hash(&self) -> Option<u64> {
        if self.window.len() == self.window_size {
            Some(self.hash.0)
        } else {
            None
        }
    }

    /// The configured window size.
    pub fn window_size(&self) -> usize {
        self.window_size
    }
}

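/// Computes the rolling hash of every `window_size`-token window in `tokens`,
/// returning `(hash, start_index)` pairs; yields `tokens.len() - window_size + 1`
/// entries when enough tokens are present.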
pub fn compute_rolling_hashes(tokens: &[Token], window_size: usize) -> Vec<(u64, usize)> {
    if window_size == 0 || tokens.len() < window_size {
        return Vec::new();
    }

    let mut hasher = RollingHash::new(window_size);
    let mut hashes = Vec::new();

    for (idx, token) in tokens.iter().enumerate() {
        let token_hash = hash_token(token);
        if let Some(hash) = hasher.roll(token_hash) {
            // `idx` is the last token of the window; record where it starts.
            let start_index = idx.saturating_sub(window_size - 1);
            hashes.push((hash, start_index));
        }
    }

    hashes
}

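/// Hashes a single token via its canonical string form, so tokens that
/// normalize identically produce identical hashes.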
fn hash_token(token: &Token) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    token.as_hash_string().hash(&mut hasher);
    hasher.finish()
}

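/// A detected clone: a region starting at `source_start` that matches a
/// region starting at `target_start`, with per-side lengths and a similarity
/// score in [0, 1].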
#[derive(Debug, Clone)]
pub struct CloneMatch {
    pub source_start: usize,
    pub target_start: usize,
    pub length: usize,
    pub target_length: usize,
    pub similarity: f64,
}

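/// Finds exact (Type-1/2) duplicates within a single token stream by hashing
/// every window, verifying candidate hash matches token-by-token, and then
/// greedily extending each verified match for as long as the tokens stay equal.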
pub fn detect_duplicates_with_extension(tokens: &[Token], window_size: usize) -> Vec<CloneMatch> {
    use std::collections::HashMap;

    if window_size == 0 || tokens.len() < window_size {
        return Vec::new();
    }

    let mut matches = Vec::new();
    let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
    let mut i = 0;

    while i <= tokens.len().saturating_sub(window_size) {
        let window_start = i;
        let current_hash = compute_window_hash(&tokens[window_start..window_start + window_size]);

        if let Some(prev_indices) = hash_map.get(&current_hash) {
            for &prev_index in prev_indices.iter() {
                if prev_index >= i {
                    continue;
                }

                // Hashes can collide, so confirm the windows match token-by-token.
                if verify_window_match(tokens, prev_index, i, window_size) {
                    // Greedily extend past the window while the tokens stay
                    // equal and the two regions do not overlap.
                    let mut extension = 0;
                    while (i + window_size + extension < tokens.len())
                        && (prev_index + window_size + extension < i)
                        && (tokens[prev_index + window_size + extension]
                            == tokens[i + window_size + extension])
                    {
                        extension += 1;
                    }

                    let total_length = window_size + extension;

                    matches.push(CloneMatch {
                        source_start: prev_index,
                        target_start: i,
                        length: total_length,
                        target_length: total_length,
                        // Exact match by construction.
                        similarity: 1.0,
                    });

                    i += extension.max(1);
                    break;
                }
            }
        }

        // Index the window at its original position, even if `i` advanced.
        hash_map.entry(current_hash).or_default().push(window_start);
        i += 1;
    }

    matches
}

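/// Polynomial hash of a token window under a large prime modulus. This is a
/// scheme independent of `RollingHash` (recomputed per window rather than
/// rolled), used only by `detect_duplicates_with_extension`.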
fn compute_window_hash(window: &[Token]) -> u64 {
    const BASE: u64 = 257;
    const MODULUS: u64 = 1_000_000_007;

    let mut hash: u64 = 0;
    for token in window {
        let token_hash = hash_token(token);
        // Widen to u128 so the multiply-add cannot overflow before the modulo.
        let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
        hash = wide_hash as u64;
    }
    hash
}

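/// Confirms that two windows of length `len` contain identical tokens,
/// guarding against hash collisions.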
fn verify_window_match(tokens: &[Token], idx_a: usize, idx_b: usize, len: usize) -> bool {
    if idx_a + len > tokens.len() || idx_b + len > tokens.len() {
        return false;
    }
    tokens[idx_a..idx_a + len] == tokens[idx_b..idx_b + len]
}

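/// Keywords recognized across the supported languages. Duplicates across the
/// language groups are harmless: lookup is a linear slice scan.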
fn get_keyword_set() -> &'static [&'static str] {
    &[
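        // Rust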
427 "as",
429 "break",
430 "const",
431 "continue",
432 "crate",
433 "else",
434 "enum",
435 "extern",
436 "false",
437 "fn",
438 "for",
439 "if",
440 "impl",
441 "in",
442 "let",
443 "loop",
444 "match",
445 "mod",
446 "move",
447 "mut",
448 "pub",
449 "ref",
450 "return",
451 "self",
452 "Self",
453 "static",
454 "struct",
455 "super",
456 "trait",
457 "true",
458 "type",
459 "unsafe",
460 "use",
461 "where",
462 "while",
463 "async",
464 "await",
465 "dyn",
466 "and",
468 "assert",
469 "class",
470 "def",
471 "del",
472 "elif",
473 "except",
474 "finally",
475 "from",
476 "global",
477 "import",
478 "is",
479 "lambda",
480 "nonlocal",
481 "not",
482 "or",
483 "pass",
484 "raise",
485 "try",
486 "with",
487 "yield",
488 "await",
490 "case",
491 "catch",
492 "class",
493 "const",
494 "continue",
495 "debugger",
496 "default",
497 "delete",
498 "do",
499 "else",
500 "export",
501 "extends",
502 "finally",
503 "for",
504 "function",
505 "if",
506 "import",
507 "in",
508 "instanceof",
509 "let",
510 "new",
511 "return",
512 "super",
513 "switch",
514 "this",
515 "throw",
516 "try",
517 "typeof",
518 "var",
519 "void",
520 "while",
521 "with",
522 "yield",
523 ]
524}
525
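/// Whether `s` is a recognized one- or two-character operator.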
fn is_operator(s: &str) -> bool {
    matches!(
        s,
        "+" | "-"
            | "*"
            | "/"
            | "%"
            | "="
            | "=="
            | "!="
            | "<"
            | ">"
            | "<="
            | ">="
            | "&&"
            | "||"
            | "!"
            | "&"
            | "|"
            | "^"
            | "<<"
            | ">>"
            | "+="
            | "-="
            | "*="
            | "/="
            | "=>"
            | "->"
            | "::"
            | "."
    )
}

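/// Whether `ch` is a recognized punctuation character.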
fn is_punctuation(ch: char) -> bool {
    matches!(
        ch,
        '(' | ')' | '{' | '}' | '[' | ']' | ';' | ':' | ',' | '?'
    )
}

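/// Similarity between two token sequences as 1 - (edit distance / max length),
/// clamped to [0, 1]. Two empty sequences are considered identical.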
pub fn compute_token_similarity(tokens1: &[Token], tokens2: &[Token]) -> f64 {
    if tokens1.is_empty() && tokens2.is_empty() {
        return 1.0;
    }
    if tokens1.is_empty() || tokens2.is_empty() {
        return 0.0;
    }

    let distance = compute_token_edit_distance(tokens1, tokens2);
    let max_len = tokens1.len().max(tokens2.len());

    let similarity = 1.0 - (distance as f64 / max_len as f64);
    similarity.clamp(0.0, 1.0)
}

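/// Levenshtein distance over tokens, using the classic two-row dynamic
/// programming formulation: O(len1 * len2) time, O(len2) space.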
pub fn compute_token_edit_distance(tokens1: &[Token], tokens2: &[Token]) -> usize {
    let len1 = tokens1.len();
    let len2 = tokens2.len();

    if len1 == 0 {
        return len2;
    }
    if len2 == 0 {
        return len1;
    }

    let mut prev_row: Vec<usize> = (0..=len2).collect();
    let mut curr_row: Vec<usize> = vec![0; len2 + 1];

    for i in 1..=len1 {
        curr_row[0] = i;

        for j in 1..=len2 {
            let cost = if tokens1[i - 1] == tokens2[j - 1] { 0 } else { 1 };

            curr_row[j] = (prev_row[j - 1] + cost) // substitution
                .min(prev_row[j] + 1)              // deletion
                .min(curr_row[j - 1] + 1);         // insertion
        }

        std::mem::swap(&mut prev_row, &mut curr_row);
    }

    prev_row[len2]
}

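/// Finds near-miss (Type-3) clones between two token streams: for each pair
/// of starting windows, tries bounded extensions of both sides and keeps the
/// extension pair with the highest similarity at or above `tolerance`.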
pub fn detect_type3_clones(
    tokens1: &[Token],
    tokens2: &[Token],
    window_size: usize,
    tolerance: f64,
) -> Vec<CloneMatch> {
    let mut matches = Vec::new();

    // Cheap pre-filter: streams whose total lengths differ by more than 20%
    // are unlikely to contain clones at the requested tolerance.
    let len_ratio = tokens1.len() as f64 / tokens2.len().max(1) as f64;
    if !(0.8..=1.2).contains(&len_ratio) {
        return matches;
    }
    if tokens1.len() < window_size || tokens2.len() < window_size || window_size == 0 {
        return matches;
    }

    // How far a window may be extended on either side, derived from the number
    // of edits the tolerance allows, capped to keep the search cheap.
    let base_delta = ((1.0 - tolerance) * window_size as f64).ceil() as usize;
    let length_delta_limit = (base_delta + 2).max(1).min(window_size).min(20);
    // Windows below this similarity are skipped without trying extensions.
    let similarity_floor = (tolerance - 0.3).max(0.0);

    let mut i = 0;
    while i + window_size <= tokens1.len() {
        let mut j = 0;
        while j + window_size <= tokens2.len() {
            let window1 = &tokens1[i..i + window_size];
            let window2 = &tokens2[j..j + window_size];

            let similarity = compute_token_similarity(window1, window2);

            if similarity < similarity_floor {
                j += 1;
            } else {
                let max_extra1 =
                    length_delta_limit.min(tokens1.len().saturating_sub(i + window_size));
                let max_extra2 =
                    length_delta_limit.min(tokens2.len().saturating_sub(j + window_size));

                let mut best_match: Option<(usize, usize, f64)> = None;

                for extra1 in 0..=max_extra1 {
                    let len1 = window_size + extra1;
                    let window1 = &tokens1[i..i + len1];

                    for extra2 in 0..=max_extra2 {
                        let len2 = window_size + extra2;

                        if len1.abs_diff(len2) > length_delta_limit {
                            continue;
                        }

                        // The length difference is a lower bound on the edit
                        // distance; prune pairs that cannot reach `tolerance`.
                        let max_len = len1.max(len2);
                        let min_distance = len1.abs_diff(len2);
                        let max_allowed_distance =
                            ((1.0 - tolerance) * max_len as f64).ceil() as usize;

                        if min_distance > max_allowed_distance {
                            continue;
                        }

                        let candidate_similarity = if extra1 == 0 && extra2 == 0 {
                            similarity
                        } else {
                            compute_token_similarity(window1, &tokens2[j..j + len2])
                        };

                        if candidate_similarity >= tolerance {
                            match best_match {
                                None => best_match = Some((len1, len2, candidate_similarity)),
                                Some((best_len1, best_len2, best_sim)) => {
                                    // Prefer higher similarity; break ties by
                                    // preferring the longer match.
                                    let better_similarity =
                                        candidate_similarity > best_sim + f64::EPSILON;
                                    let better_coverage = !better_similarity
                                        && len1.max(len2) > best_len1.max(best_len2);

                                    if better_similarity || better_coverage {
                                        best_match = Some((len1, len2, candidate_similarity));
                                    }
                                }
                            }
                        }
                    }
                }

                if let Some((source_len, target_len, final_similarity)) = best_match {
                    matches.push(CloneMatch {
                        source_start: i,
                        target_start: j,
                        length: source_len,
                        target_length: target_len,
                        similarity: final_similarity,
                    });

                    // Jump past the matched region on the target side.
                    let skip = target_len.saturating_sub(window_size).max(1);
                    j += skip;
                } else {
                    j += 1;
                }
            }
        }
        i += 1;
    }

    matches
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_rust_code() {
        let code = r#"
            fn add(x: i32, y: i32) -> i32 {
                x + y
            }
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "fn")));

        assert!(tokens.contains(&Token::Identifier));
    }

    #[test]
    fn test_normalize_python_code() {
        let code = r#"
            def greet(name):
                return f"Hello, {name}!"
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "def")));
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));

        assert!(tokens.contains(&Token::StringLiteral));
    }

    #[test]
    fn test_normalize_javascript_code() {
        let code = r#"
            function multiply(a, b) {
                return a * b;
            }
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "function")));
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
    }

    #[test]
    fn test_normalize_ignores_comments() {
        let code = r#"
            // This is a comment
            fn test() {
                /* Multi-line
                   comment */
                let x = 5; // inline comment
            }
        "#;

        // Comments contribute no tokens, so the commented version must
        // normalize to the same stream as the comment-free version.
        let bare = "fn test() { let x = 5; }";
        assert_eq!(normalize(code), normalize(bare));
    }

    #[test]
    fn test_rolling_hash_creation() {
        let hasher = RollingHash::new(50);
        assert_eq!(hasher.window_size(), 50);
        assert_eq!(hasher.current_hash(), None);
    }

    #[test]
    fn test_rolling_hash_basic() {
        let mut hasher = RollingHash::new(3);

        // The first window hash appears once three tokens have been pushed.
        assert_eq!(hasher.roll(1), None);
        assert_eq!(hasher.roll(2), None);
        let hash1 = hasher.roll(3);
        assert!(hash1.is_some());

        let hash2 = hasher.roll(4);
        assert!(hash2.is_some());

        // Different windows should (almost always) hash differently.
        assert_ne!(hash1, hash2);
    }

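    // Added regression check (not in the original suite): after rolling, the
    // hash of the window [2, 3, 4] must equal the hash produced by feeding the
    // same three tokens into a fresh hasher.
    #[test]
    fn test_rolling_hash_matches_fresh_hash() {
        let mut rolled = RollingHash::new(3);
        rolled.roll(1);
        rolled.roll(2);
        rolled.roll(3);
        let rolled_hash = rolled.roll(4);

        let mut fresh = RollingHash::new(3);
        fresh.roll(2);
        fresh.roll(3);
        let fresh_hash = fresh.roll(4);

        assert_eq!(rolled_hash, fresh_hash);
    }
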
    #[test]
    fn test_compute_rolling_hashes() {
        let tokens = vec![
            Token::Keyword("fn".to_string()),
            Token::Identifier,
            Token::Punctuation("(".to_string()),
            Token::Identifier,
            Token::Punctuation(")".to_string()),
        ];

        let hashes = compute_rolling_hashes(&tokens, 3);
        // 5 tokens with window 3 yield 5 - 3 + 1 = 3 window hashes.
        assert_eq!(hashes.len(), 3);
    }

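    // Added sanity check (not in the original suite): a stream that repeats
    // the same five-token statement should yield at least one exact duplicate
    // via the hash-and-verify path, reported with similarity 1.0.
    #[test]
    fn test_detect_duplicates_basic() {
        let tokens = normalize("let a = 1; let b = 2; let a = 1; let b = 2;");
        let clones = detect_duplicates_with_extension(&tokens, 5);
        assert!(!clones.is_empty());
        assert!(clones
            .iter()
            .all(|c| (c.similarity - 1.0).abs() < f64::EPSILON));
    }
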
    #[test]
    fn test_hash_token_consistency() {
        let token1 = Token::Identifier;
        let token2 = Token::Identifier;

        assert_eq!(hash_token(&token1), hash_token(&token2));
    }

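    // Added sanity check (not in the original suite): similarity is 1.0 for
    // identical streams and 1 - 1/5 = 0.8 when one token of five differs.
    #[test]
    fn test_compute_token_similarity_basic() {
        let a = normalize("x + y * z");
        let b = normalize("x - y * z");
        assert_eq!(compute_token_similarity(&a, &a), 1.0);
        assert!((compute_token_similarity(&a, &b) - 0.8).abs() < 1e-9);
    }
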
    #[test]
    fn test_token_as_hash_string() {
        assert_eq!(Token::Identifier.as_hash_string(), "$$ID");
        assert_eq!(Token::StringLiteral.as_hash_string(), "$$STR");
        assert_eq!(Token::NumberLiteral.as_hash_string(), "$$NUM");
        assert_eq!(Token::Keyword("fn".to_string()).as_hash_string(), "fn");
    }
}