1use serde::{Deserialize, Serialize};
10use std::num::Wrapping;
11
12#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
17pub enum Token {
18 Keyword(String),
20 Identifier,
22 StringLiteral,
24 NumberLiteral,
26 Operator(String),
28 Punctuation(String),
30}
31
32impl Token {
33 pub fn as_hash_string(&self) -> &str {
35 match self {
36 Token::Keyword(kw) => kw.as_str(),
37 Token::Identifier => "$$ID",
38 Token::StringLiteral => "$$STR",
39 Token::NumberLiteral => "$$NUM",
40 Token::Operator(op) => op.as_str(),
41 Token::Punctuation(p) => p.as_str(),
42 }
43 }
44}
45
46pub fn normalize(code: &str) -> Vec<Token> {
62 let (tokens, _) = normalize_with_line_numbers(code);
63 tokens
64}
65
66pub fn normalize_with_line_numbers(code: &str) -> (Vec<Token>, Vec<usize>) {
71 let keywords = get_keyword_set();
72 let estimated_tokens = code.len() / 3;
74 let mut tokens = Vec::with_capacity(estimated_tokens);
75 let mut line_offsets = Vec::with_capacity(estimated_tokens);
76 let chars: Vec<char> = code.chars().collect();
77 let mut i = 0;
78 let mut line = 0;
79
80 while i < chars.len() {
81 let ch = chars[i];
82
83 if ch.is_whitespace() {
85 if ch == '\n' {
86 line += 1;
87 }
88 i += 1;
89 continue;
90 }
91
92 if (ch == '/' && i + 1 < chars.len() && chars[i + 1] == '/') || ch == '#' {
94 while i < chars.len() && chars[i] != '\n' {
96 i += 1;
97 }
98 continue;
99 }
100
101 if ch == '/' && i + 1 < chars.len() && chars[i + 1] == '*' {
103 i += 2;
104 while i + 1 < chars.len() {
105 if chars[i] == '\n' {
106 line += 1;
107 }
108 if chars[i] == '*' && chars[i + 1] == '/' {
109 i += 2;
110 break;
111 }
112 i += 1;
113 }
114 continue;
115 }
116
117 if ch == '"' || ch == '\'' {
119 let quote = ch;
120 let token_line = line;
121 i += 1;
122 while i < chars.len() {
124 if chars[i] == '\\' {
125 i += 2; continue;
127 }
128 if chars[i] == '\n' {
129 line += 1;
130 }
131 if chars[i] == quote {
132 i += 1;
133 break;
134 }
135 i += 1;
136 }
137 tokens.push(Token::StringLiteral);
138 line_offsets.push(token_line);
139 continue;
140 }
141
142 if ch.is_ascii_digit() {
144 let token_line = line;
145 while i < chars.len() && (chars[i].is_ascii_alphanumeric() || chars[i] == '.') {
146 i += 1;
147 }
148 tokens.push(Token::NumberLiteral);
149 line_offsets.push(token_line);
150 continue;
151 }
152
153 if ch.is_alphabetic() || ch == '_' || ch == '$' {
155 let token_line = line;
156 let start = i;
157 while i < chars.len()
158 && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '$')
159 {
160 i += 1;
161 }
162 let word: String = chars[start..i].iter().collect();
163
164 if keywords.contains(&word.as_str()) {
165 tokens.push(Token::Keyword(word));
166 } else {
167 tokens.push(Token::Identifier);
168 }
169 line_offsets.push(token_line);
170 continue;
171 }
172
173 if i + 1 < chars.len() {
175 let two_char: String = chars[i..i + 2].iter().collect();
176 if is_operator(&two_char) {
177 let token_line = line;
178 tokens.push(Token::Operator(two_char));
179 i += 2;
180 line_offsets.push(token_line);
181 continue;
182 }
183 }
184
185 let single = ch.to_string();
187 if is_operator(&single) {
188 let token_line = line;
189 tokens.push(Token::Operator(single));
190 line_offsets.push(token_line);
191 } else if is_punctuation(ch) {
192 let token_line = line;
193 tokens.push(Token::Punctuation(single));
194 line_offsets.push(token_line);
195 }
196
197 i += 1;
198 }
199
200 (tokens, line_offsets)
201}
202
203#[derive(Debug, Clone)]
208pub struct RollingHash {
209 window_size: usize,
211 base: Wrapping<u64>,
213 hash: Wrapping<u64>,
215 base_power: Wrapping<u64>,
217 window: Vec<u64>,
219}
220
221impl RollingHash {
222 pub fn new(window_size: usize) -> Self {
227 let base = Wrapping(257u64);
228 let mut base_power = Wrapping(1u64);
229
230 for _ in 0..window_size {
232 base_power *= base;
233 }
234
235 Self {
236 window_size,
237 base,
238 hash: Wrapping(0),
239 base_power,
240 window: Vec::with_capacity(window_size),
241 }
242 }
243
244 pub fn roll(&mut self, token_hash: u64) -> Option<u64> {
254 if self.window.len() < self.window_size {
255 self.window.push(token_hash);
257 self.hash = self.hash * self.base + Wrapping(token_hash);
258
259 if self.window.len() == self.window_size {
260 Some(self.hash.0)
261 } else {
262 None
263 }
264 } else {
265 let old_token = self.window.remove(0);
267 self.window.push(token_hash);
268
269 self.hash -= Wrapping(old_token) * self.base_power;
271 self.hash = self.hash * self.base + Wrapping(token_hash);
273
274 Some(self.hash.0)
275 }
276 }
277
278 pub fn reset(&mut self) {
280 self.hash = Wrapping(0);
281 self.window.clear();
282 }
283
284 pub fn current_hash(&self) -> Option<u64> {
286 if self.window.len() == self.window_size {
287 Some(self.hash.0)
288 } else {
289 None
290 }
291 }
292
293 pub fn window_size(&self) -> usize {
295 self.window_size
296 }
297}
298
299pub fn compute_rolling_hashes(tokens: &[Token], window_size: usize) -> Vec<(u64, usize)> {
308 if tokens.len() < window_size {
309 return Vec::new();
310 }
311
312 let mut hasher = RollingHash::new(window_size);
313 let mut hashes = Vec::new();
314
315 for (idx, token) in tokens.iter().enumerate() {
316 let token_hash = hash_token(token);
317 if let Some(hash) = hasher.roll(token_hash) {
318 let start_index = idx.saturating_sub(window_size - 1);
321 hashes.push((hash, start_index));
322 }
323 }
324
325 hashes
326}
327
328fn hash_token(token: &Token) -> u64 {
330 use std::collections::hash_map::DefaultHasher;
331 use std::hash::{Hash, Hasher};
332
333 let mut hasher = DefaultHasher::new();
334 token.as_hash_string().hash(&mut hasher);
335 hasher.finish()
336}
337
338#[derive(Debug, Clone)]
340pub struct CloneMatch {
341 pub source_start: usize,
342 pub target_start: usize,
343 pub length: usize,
345 pub target_length: usize,
347 pub similarity: f64,
349}
350
351pub fn detect_duplicates_with_extension(tokens: &[Token], window_size: usize) -> Vec<CloneMatch> {
366 use std::collections::HashMap;
367
368 if tokens.len() < window_size {
369 return Vec::new();
370 }
371
372 let mut matches = Vec::new();
373 let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
374 let mut i = 0;
375
376 while i <= tokens.len().saturating_sub(window_size) {
378 let current_hash = compute_window_hash(&tokens[i..i + window_size]);
380
381 if let Some(prev_indices) = hash_map.get(¤t_hash) {
383 for &prev_index in prev_indices.iter() {
385 if prev_index >= i {
387 continue;
388 }
389
390 if verify_window_match(tokens, prev_index, i, window_size) {
392 let mut extension = 0;
394 while (i + window_size + extension < tokens.len())
395 && (prev_index + window_size + extension < i)
396 && (tokens[prev_index + window_size + extension]
397 == tokens[i + window_size + extension])
398 {
399 extension += 1;
400 }
401
402 let total_length = window_size + extension;
403
404 matches.push(CloneMatch {
406 source_start: prev_index,
407 target_start: i,
408 length: total_length,
409 target_length: total_length,
410 similarity: 1.0, });
412
413 i += extension.max(1);
415 break; }
417 }
418 }
419
420 hash_map.entry(current_hash).or_default().push(i);
422 i += 1;
423 }
424
425 matches
426}
427
428pub fn compute_window_hash(window: &[Token]) -> u64 {
434 const BASE: u64 = 257;
436 const MODULUS: u64 = 1_000_000_007;
438
439 let mut hash: u64 = 0;
440 for token in window {
441 let token_hash = hash_token(token);
442 let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
444 hash = wide_hash as u64;
445 }
446 hash
447}
448
449fn verify_window_match(tokens: &[Token], idx_a: usize, idx_b: usize, len: usize) -> bool {
451 if idx_a + len > tokens.len() || idx_b + len > tokens.len() {
452 return false;
453 }
454 tokens[idx_a..idx_a + len] == tokens[idx_b..idx_b + len]
455}
456
457pub fn verify_cross_window_match(
461 tokens1: &[Token],
462 tokens2: &[Token],
463 idx1: usize,
464 idx2: usize,
465 len: usize,
466) -> bool {
467 if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
468 return false;
469 }
470 tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
471}
472
473pub fn extend_match(
488 tokens1: &[Token],
489 tokens2: &[Token],
490 pos1: usize,
491 pos2: usize,
492 initial_len: usize,
493) -> usize {
494 let mut extension = 0;
495 while (pos1 + initial_len + extension < tokens1.len())
496 && (pos2 + initial_len + extension < tokens2.len())
497 && (tokens1[pos1 + initial_len + extension] == tokens2[pos2 + initial_len + extension])
498 {
499 extension += 1;
500 }
501 extension
502}
503
504fn get_keyword_set() -> &'static [&'static str] {
506 &[
507 "as",
509 "break",
510 "const",
511 "continue",
512 "crate",
513 "else",
514 "enum",
515 "extern",
516 "false",
517 "fn",
518 "for",
519 "if",
520 "impl",
521 "in",
522 "let",
523 "loop",
524 "match",
525 "mod",
526 "move",
527 "mut",
528 "pub",
529 "ref",
530 "return",
531 "self",
532 "Self",
533 "static",
534 "struct",
535 "super",
536 "trait",
537 "true",
538 "type",
539 "unsafe",
540 "use",
541 "where",
542 "while",
543 "async",
544 "await",
545 "dyn",
546 "and",
548 "assert",
549 "class",
550 "def",
551 "del",
552 "elif",
553 "except",
554 "finally",
555 "from",
556 "global",
557 "import",
558 "is",
559 "lambda",
560 "nonlocal",
561 "not",
562 "or",
563 "pass",
564 "raise",
565 "try",
566 "with",
567 "yield",
568 "await",
570 "case",
571 "catch",
572 "class",
573 "const",
574 "continue",
575 "debugger",
576 "default",
577 "delete",
578 "do",
579 "else",
580 "export",
581 "extends",
582 "finally",
583 "for",
584 "function",
585 "if",
586 "import",
587 "in",
588 "instanceof",
589 "let",
590 "new",
591 "return",
592 "super",
593 "switch",
594 "this",
595 "throw",
596 "try",
597 "typeof",
598 "var",
599 "void",
600 "while",
601 "with",
602 "yield",
603 ]
604}
605
606fn is_operator(s: &str) -> bool {
608 matches!(
609 s,
610 "+" | "-"
611 | "*"
612 | "/"
613 | "%"
614 | "="
615 | "=="
616 | "!="
617 | "<"
618 | ">"
619 | "<="
620 | ">="
621 | "&&"
622 | "||"
623 | "!"
624 | "&"
625 | "|"
626 | "^"
627 | "<<"
628 | ">>"
629 | "+="
630 | "-="
631 | "*="
632 | "/="
633 | "=>"
634 | "->"
635 | "::"
636 | "."
637 )
638}
639
640fn is_punctuation(ch: char) -> bool {
642 matches!(
643 ch,
644 '(' | ')' | '{' | '}' | '[' | ']' | ';' | ':' | ',' | '?'
645 )
646}
647
648pub fn compute_token_similarity(tokens1: &[Token], tokens2: &[Token]) -> f64 {
664 if tokens1.is_empty() && tokens2.is_empty() {
665 return 1.0;
666 }
667 if tokens1.is_empty() || tokens2.is_empty() {
668 return 0.0;
669 }
670
671 let distance = compute_token_edit_distance(tokens1, tokens2);
673 let max_len = tokens1.len().max(tokens2.len());
674
675 let similarity = 1.0 - (distance as f64 / max_len as f64);
677 similarity.clamp(0.0, 1.0)
678}
679
680pub fn compute_token_edit_distance(tokens1: &[Token], tokens2: &[Token]) -> usize {
693 let len1 = tokens1.len();
694 let len2 = tokens2.len();
695
696 if len1 == 0 {
697 return len2;
698 }
699 if len2 == 0 {
700 return len1;
701 }
702
703 let mut prev_row: Vec<usize> = (0..=len2).collect();
705 let mut curr_row: Vec<usize> = vec![0; len2 + 1];
706
707 for i in 1..=len1 {
708 curr_row[0] = i;
709
710 for j in 1..=len2 {
711 let cost = if tokens1[i - 1] == tokens2[j - 1] {
712 0
713 } else {
714 1
715 };
716
717 curr_row[j] = (prev_row[j - 1] + cost) .min(prev_row[j] + 1) .min(curr_row[j - 1] + 1); }
721
722 std::mem::swap(&mut prev_row, &mut curr_row);
723 }
724
725 prev_row[len2]
726}
727
728#[allow(clippy::too_many_arguments)]
736fn explore_length_variants(
737 tokens1: &[Token],
738 tokens2: &[Token],
739 i: usize,
740 j: usize,
741 window_size: usize,
742 base_similarity: f64,
743 length_delta_limit: usize,
744 tolerance: f64,
745) -> Option<(usize, usize, f64)> {
746 let max_extra1 = length_delta_limit.min(tokens1.len().saturating_sub(i + window_size));
747 let max_extra2 = length_delta_limit.min(tokens2.len().saturating_sub(j + window_size));
748
749 let mut best_match: Option<(usize, usize, f64)> = None;
750
751 for extra1 in 0..=max_extra1 {
752 let len1 = window_size + extra1;
753 let window1 = &tokens1[i..i + len1];
754
755 for extra2 in 0..=max_extra2 {
756 let len2 = window_size + extra2;
757
758 if len1.abs_diff(len2) > length_delta_limit {
760 continue;
761 }
762
763 let max_len = len1.max(len2);
765 let min_distance = len1.abs_diff(len2);
766 let max_allowed_distance = ((1.0 - tolerance) * max_len as f64).ceil() as usize;
767
768 if min_distance > max_allowed_distance {
769 continue;
770 }
771
772 let candidate_similarity = if extra1 == 0 && extra2 == 0 {
774 base_similarity
775 } else {
776 compute_token_similarity(window1, &tokens2[j..j + len2])
777 };
778
779 if candidate_similarity >= tolerance {
780 best_match = match best_match {
781 None => Some((len1, len2, candidate_similarity)),
782 Some((best_len1, best_len2, best_sim)) => {
783 let better_similarity = candidate_similarity > best_sim + f64::EPSILON;
784 let better_coverage =
785 !better_similarity && len1.max(len2) > best_len1.max(best_len2);
786
787 if better_similarity || better_coverage {
788 Some((len1, len2, candidate_similarity))
789 } else {
790 best_match
791 }
792 }
793 };
794 }
795 }
796 }
797
798 best_match
799}
800
801pub fn detect_type3_clones(
817 tokens1: &[Token],
818 tokens2: &[Token],
819 window_size: usize,
820 tolerance: f64,
821) -> Vec<CloneMatch> {
822 let mut matches = Vec::new();
823
824 let len_ratio = tokens1.len() as f64 / tokens2.len().max(1) as f64;
826 if !(0.8..=1.2).contains(&len_ratio) {
827 return matches;
828 }
829 if tokens1.len() < window_size || tokens2.len() < window_size || window_size == 0 {
830 return matches;
831 }
832
833 let base_delta = ((1.0 - tolerance) * window_size as f64).ceil() as usize;
836 let length_delta_limit = (base_delta + 2).max(1).min(window_size).min(20);
837 let similarity_floor = (tolerance - 0.3).max(0.0);
839
840 let mut i = 0;
842 while i + window_size <= tokens1.len() {
843 let mut j = 0;
844 while j + window_size <= tokens2.len() {
845 let window1 = &tokens1[i..i + window_size];
847 let window2 = &tokens2[j..j + window_size];
848 let similarity = compute_token_similarity(window1, window2);
849
850 if similarity < similarity_floor {
851 j += 1;
852 continue;
853 }
854
855 let best_match = explore_length_variants(
857 tokens1,
858 tokens2,
859 i,
860 j,
861 window_size,
862 similarity,
863 length_delta_limit,
864 tolerance,
865 );
866
867 if let Some((source_len, target_len, final_similarity)) = best_match {
868 matches.push(CloneMatch {
869 source_start: i,
870 target_start: j,
871 length: source_len,
872 target_length: target_len,
873 similarity: final_similarity,
874 });
875
876 let skip = target_len.saturating_sub(window_size).max(1);
878 j += skip;
879 } else {
880 j += 1;
881 }
882 }
883 i += 1;
884 }
885
886 matches
887}
888
889#[cfg(test)]
890mod tests {
891 use super::*;
892
893 #[test]
894 fn test_normalize_rust_code() {
895 let code = r#"
896 fn add(x: i32, y: i32) -> i32 {
897 x + y
898 }
899 "#;
900
901 let tokens = normalize(code);
902 assert!(!tokens.is_empty());
903
904 assert!(tokens
906 .iter()
907 .any(|t| matches!(t, Token::Keyword(k) if k == "fn")));
908
909 assert!(tokens.contains(&Token::Identifier));
911 }
912
913 #[test]
914 fn test_normalize_python_code() {
915 let code = r#"
916 def greet(name):
917 return f"Hello, {name}!"
918 "#;
919
920 let tokens = normalize(code);
921 assert!(!tokens.is_empty());
922
923 assert!(tokens
925 .iter()
926 .any(|t| matches!(t, Token::Keyword(k) if k == "def")));
927 assert!(tokens
928 .iter()
929 .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
930
931 assert!(tokens.contains(&Token::StringLiteral));
933 }
934
935 #[test]
936 fn test_normalize_javascript_code() {
937 let code = r#"
938 function multiply(a, b) {
939 return a * b;
940 }
941 "#;
942
943 let tokens = normalize(code);
944 assert!(!tokens.is_empty());
945
946 assert!(tokens
948 .iter()
949 .any(|t| matches!(t, Token::Keyword(k) if k == "function")));
950 assert!(tokens
951 .iter()
952 .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
953 }
954
955 #[test]
956 fn test_normalize_ignores_comments() {
957 let code = r#"
958 // This is a comment
959 fn test() {
960 /* Multi-line
961 comment */
962 let x = 5; // inline comment
963 }
964 "#;
965
966 let tokens = normalize(code);
967
968 for token in &tokens {
970 if let Token::Identifier = token {
971 } else if let Token::Keyword(_) = token {
973 }
975 }
976 }
977
978 #[test]
979 fn test_rolling_hash_creation() {
980 let hasher = RollingHash::new(50);
981 assert_eq!(hasher.window_size(), 50);
982 assert_eq!(hasher.current_hash(), None);
983 }
984
985 #[test]
986 fn test_rolling_hash_basic() {
987 let mut hasher = RollingHash::new(3);
988
989 assert_eq!(hasher.roll(1), None); assert_eq!(hasher.roll(2), None); let hash1 = hasher.roll(3); assert!(hash1.is_some());
995
996 let hash2 = hasher.roll(4); assert!(hash2.is_some());
998
999 assert_ne!(hash1, hash2);
1001 }
1002
1003 #[test]
1004 fn test_compute_rolling_hashes() {
1005 let tokens = vec![
1006 Token::Keyword("fn".to_string()),
1007 Token::Identifier,
1008 Token::Punctuation("(".to_string()),
1009 Token::Identifier,
1010 Token::Punctuation(")".to_string()),
1011 ];
1012
1013 let hashes = compute_rolling_hashes(&tokens, 3);
1014 assert_eq!(hashes.len(), 3); }
1016
1017 #[test]
1018 fn test_hash_token_consistency() {
1019 let token1 = Token::Identifier;
1020 let token2 = Token::Identifier;
1021
1022 assert_eq!(hash_token(&token1), hash_token(&token2));
1023 }
1024
1025 #[test]
1026 fn test_token_as_hash_string() {
1027 assert_eq!(Token::Identifier.as_hash_string(), "$$ID");
1028 assert_eq!(Token::StringLiteral.as_hash_string(), "$$STR");
1029 assert_eq!(Token::NumberLiteral.as_hash_string(), "$$NUM");
1030 assert_eq!(Token::Keyword("fn".to_string()).as_hash_string(), "fn");
1031 }
1032}