use std::num::Wrapping;

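/// A normalized source token. Identifiers and literals are collapsed into
/// placeholder variants so that renamed variables and changed literal values
/// still produce the same token stream; keywords, operators, and punctuation
/// keep their text.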
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    Keyword(String),
    Identifier,
    StringLiteral,
    NumberLiteral,
    Operator(String),
    Punctuation(String),
}

impl Token {
    /// Returns the canonical string used when hashing this token.
    pub fn as_hash_string(&self) -> &str {
        match self {
            Token::Keyword(kw) => kw.as_str(),
            Token::Identifier => "$$ID",
            Token::StringLiteral => "$$STR",
            Token::NumberLiteral => "$$NUM",
            Token::Operator(op) => op.as_str(),
            Token::Punctuation(p) => p.as_str(),
        }
    }
}

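/// Normalizes source code into a token stream, discarding line information.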
pub fn normalize(code: &str) -> Vec<Token> {
    let (tokens, _) = normalize_with_line_numbers(code);
    tokens
}

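/// Normalizes source code into a token stream, returning the tokens together
/// with the zero-based line number on which each token starts. Comments and
/// whitespace are skipped; string and numeric literals are collapsed into
/// placeholder tokens.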
pub fn normalize_with_line_numbers(code: &str) -> (Vec<Token>, Vec<usize>) {
    let keywords = get_keyword_set();
    // Rough pre-allocation: assume roughly three characters per token.
    let estimated_tokens = code.len() / 3;
    let mut tokens = Vec::with_capacity(estimated_tokens);
    let mut line_offsets = Vec::with_capacity(estimated_tokens);
    let chars: Vec<char> = code.chars().collect();
    let mut i = 0;
    let mut line = 0;

    while i < chars.len() {
        let ch = chars[i];

        // Skip whitespace, tracking line numbers.
        if ch.is_whitespace() {
            if ch == '\n' {
                line += 1;
            }
            i += 1;
            continue;
        }

        // Skip line comments (`//` and `#`).
        if (ch == '/' && i + 1 < chars.len() && chars[i + 1] == '/') || ch == '#' {
            while i < chars.len() && chars[i] != '\n' {
                i += 1;
            }
            continue;
        }

        // Skip block comments (`/* ... */`), counting newlines inside them.
        if ch == '/' && i + 1 < chars.len() && chars[i + 1] == '*' {
            i += 2;
            while i + 1 < chars.len() {
                if chars[i] == '\n' {
                    line += 1;
                }
                if chars[i] == '*' && chars[i + 1] == '/' {
                    i += 2;
                    break;
                }
                i += 1;
            }
            continue;
        }

        // Collapse string literals into a single placeholder token.
        if ch == '"' || ch == '\'' {
            let quote = ch;
            let token_line = line;
            i += 1;
            while i < chars.len() {
                // Skip escaped characters inside the literal.
                if chars[i] == '\\' {
                    i += 2;
                    continue;
                }
                if chars[i] == '\n' {
                    line += 1;
                }
                if chars[i] == quote {
                    i += 1;
                    break;
                }
                i += 1;
            }
            tokens.push(Token::StringLiteral);
            line_offsets.push(token_line);
            continue;
        }

        // Collapse numeric literals (digits, decimal points, and suffixes)
        // into a single placeholder token.
        if ch.is_ascii_digit() {
            let token_line = line;
            while i < chars.len() && (chars[i].is_ascii_alphanumeric() || chars[i] == '.') {
                i += 1;
            }
            tokens.push(Token::NumberLiteral);
            line_offsets.push(token_line);
            continue;
        }

        // Identifiers and keywords.
        if ch.is_alphabetic() || ch == '_' || ch == '$' {
            let token_line = line;
            let start = i;
            while i < chars.len()
                && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '$')
            {
                i += 1;
            }
            let word: String = chars[start..i].iter().collect();

            if keywords.contains(&word.as_str()) {
                tokens.push(Token::Keyword(word));
            } else {
                tokens.push(Token::Identifier);
            }
            line_offsets.push(token_line);
            continue;
        }

        // Two-character operators take precedence over single-character ones.
        if i + 1 < chars.len() {
            let two_char: String = chars[i..i + 2].iter().collect();
            if is_operator(&two_char) {
                let token_line = line;
                tokens.push(Token::Operator(two_char));
                i += 2;
                line_offsets.push(token_line);
                continue;
            }
        }

        // Single-character operators and punctuation; anything else is dropped.
        let single = ch.to_string();
        if is_operator(&single) {
            let token_line = line;
            tokens.push(Token::Operator(single));
            line_offsets.push(token_line);
        } else if is_punctuation(ch) {
            let token_line = line;
            tokens.push(Token::Punctuation(single));
            line_offsets.push(token_line);
        }

        i += 1;
    }

    (tokens, line_offsets)
}

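/// Polynomial rolling hash over a fixed-size window of token hashes.
///
/// Maintains `hash = t0 * B^(w-1) + t1 * B^(w-2) + ... + t(w-1)` for the
/// current window, where `B` is the base and `w` the window size, using
/// wrapping arithmetic so overflow is well defined.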
#[derive(Debug, Clone)]
pub struct RollingHash {
    /// Number of token hashes covered by one window.
    window_size: usize,
    /// Multiplier base of the polynomial hash.
    base: Wrapping<u64>,
    /// Hash of the current window contents.
    hash: Wrapping<u64>,
    /// `base^window_size`, used to remove the oldest token when rolling.
    base_power: Wrapping<u64>,
    /// Token hashes currently inside the window.
    window: Vec<u64>,
}

impl RollingHash {
    /// Creates a hasher for windows of `window_size` tokens.
    pub fn new(window_size: usize) -> Self {
        let base = Wrapping(257u64);
        let mut base_power = Wrapping(1u64);

        // Precompute base^window_size for removing the outgoing token.
        for _ in 0..window_size {
            base_power *= base;
        }

        Self {
            window_size,
            base,
            hash: Wrapping(0),
            base_power,
            window: Vec::with_capacity(window_size),
        }
    }

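    /// Pushes one token hash into the window.
    ///
    /// Returns `Some(hash)` once the window is full (and on every subsequent
    /// call, after the oldest token has been rolled out), or `None` while the
    /// window is still filling.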
    pub fn roll(&mut self, token_hash: u64) -> Option<u64> {
        if self.window.len() < self.window_size {
            // Still filling the initial window.
            self.window.push(token_hash);
            self.hash = self.hash * self.base + Wrapping(token_hash);

            if self.window.len() == self.window_size {
                Some(self.hash.0)
            } else {
                None
            }
        } else {
            // Roll the window: drop the oldest token, append the newest, and
            // update the hash so it matches a freshly computed hash of the
            // new window: hash_new = hash_old * base + new - old * base^w.
            let old_token = self.window.remove(0);
            self.window.push(token_hash);

            self.hash = self.hash * self.base + Wrapping(token_hash);
            self.hash -= Wrapping(old_token) * self.base_power;

            Some(self.hash.0)
        }
    }

    /// Clears the window and resets the hash to zero.
    pub fn reset(&mut self) {
        self.hash = Wrapping(0);
        self.window.clear();
    }

    /// Returns the hash of the current window, or `None` if the window is
    /// not yet full.
    pub fn current_hash(&self) -> Option<u64> {
        if self.window.len() == self.window_size {
            Some(self.hash.0)
        } else {
            None
        }
    }

    /// Returns the configured window size.
    pub fn window_size(&self) -> usize {
        self.window_size
    }
}

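/// Computes a rolling hash for every window of `window_size` consecutive
/// tokens, returning `(hash, start_index)` pairs in order of appearance.
/// Returns an empty vector if there are fewer tokens than one window.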
pub fn compute_rolling_hashes(tokens: &[Token], window_size: usize) -> Vec<(u64, usize)> {
    if tokens.len() < window_size {
        return Vec::new();
    }

    let mut hasher = RollingHash::new(window_size);
    let mut hashes = Vec::new();

    for (idx, token) in tokens.iter().enumerate() {
        let token_hash = hash_token(token);
        if let Some(hash) = hasher.roll(token_hash) {
            // `idx` is the last token of the window; record the window start.
            let start_index = idx.saturating_sub(window_size - 1);
            hashes.push((hash, start_index));
        }
    }

    hashes
}

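/// Hashes a single token via its normalized hash string.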
fn hash_token(token: &Token) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    token.as_hash_string().hash(&mut hasher);
    hasher.finish()
}

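/// A detected clone: a region of `length` tokens starting at `source_start`
/// that matches a region of `target_length` tokens starting at
/// `target_start`, with a similarity score in `[0.0, 1.0]`.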
#[derive(Debug, Clone)]
pub struct CloneMatch {
    pub source_start: usize,
    pub target_start: usize,
    pub length: usize,
    pub target_length: usize,
    pub similarity: f64,
}

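/// Detects exact duplicate regions within a single token stream (Type-1 and
/// Type-2 clones after normalization). Matching windows of `window_size`
/// tokens are found via hashing, verified token-by-token, and then greedily
/// extended to the right as long as the regions keep matching and do not
/// overlap.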
pub fn detect_duplicates_with_extension(tokens: &[Token], window_size: usize) -> Vec<CloneMatch> {
    use std::collections::HashMap;

    if tokens.len() < window_size {
        return Vec::new();
    }

    let mut matches = Vec::new();
    let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
    let mut i = 0;

    while i <= tokens.len().saturating_sub(window_size) {
        let current_hash = compute_window_hash(&tokens[i..i + window_size]);

        if let Some(prev_indices) = hash_map.get(&current_hash) {
            for &prev_index in prev_indices.iter() {
                if prev_index >= i {
                    continue;
                }

                if verify_window_match(tokens, prev_index, i, window_size) {
                    // Extend the match to the right while both regions keep
                    // matching and the source region does not run into the
                    // target region.
                    let mut extension = 0;
                    while (i + window_size + extension < tokens.len())
                        && (prev_index + window_size + extension < i)
                        && (tokens[prev_index + window_size + extension]
                            == tokens[i + window_size + extension])
                    {
                        extension += 1;
                    }

                    let total_length = window_size + extension;

                    matches.push(CloneMatch {
                        source_start: prev_index,
                        target_start: i,
                        length: total_length,
                        target_length: total_length,
                        // Exact token match, so similarity is 1.0.
                        similarity: 1.0,
                    });

                    // Skip past the extended region to avoid re-reporting it.
                    i += extension.max(1);
                    break;
                }
            }
        }

        hash_map.entry(current_hash).or_default().push(i);
        i += 1;
    }

    matches
}

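/// Computes a polynomial hash (base 257, modulo 1_000_000_007) of a token
/// window. Unlike `RollingHash`, this recomputes the hash from scratch for
/// the given slice.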
pub fn compute_window_hash(window: &[Token]) -> u64 {
    const BASE: u64 = 257;
    const MODULUS: u64 = 1_000_000_007;

    let mut hash: u64 = 0;
    for token in window {
        let token_hash = hash_token(token);
        // Widen to u128 so the multiply-add cannot overflow before the modulo.
        let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
        hash = wide_hash as u64;
    }
    hash
}

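/// Confirms that two same-length windows within one token stream are
/// token-for-token identical (guards against hash collisions).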
fn verify_window_match(tokens: &[Token], idx_a: usize, idx_b: usize, len: usize) -> bool {
    if idx_a + len > tokens.len() || idx_b + len > tokens.len() {
        return false;
    }
    tokens[idx_a..idx_a + len] == tokens[idx_b..idx_b + len]
}

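/// Confirms that a window in `tokens1` and a window in `tokens2` are
/// token-for-token identical (guards against hash collisions across streams).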
pub fn verify_cross_window_match(
    tokens1: &[Token],
    tokens2: &[Token],
    idx1: usize,
    idx2: usize,
    len: usize,
) -> bool {
    if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
        return false;
    }
    tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
}

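/// Extends a verified match of `initial_len` tokens at `pos1`/`pos2` to the
/// right, returning how many additional tokens match in both streams.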
pub fn extend_match(
    tokens1: &[Token],
    tokens2: &[Token],
    pos1: usize,
    pos2: usize,
    initial_len: usize,
) -> usize {
    let mut extension = 0;
    while (pos1 + initial_len + extension < tokens1.len())
        && (pos2 + initial_len + extension < tokens2.len())
        && (tokens1[pos1 + initial_len + extension] == tokens2[pos2 + initial_len + extension])
    {
        extension += 1;
    }
    extension
}

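/// Returns the combined keyword list used during normalization. Keywords are
/// collected across the supported languages (Rust, Python, and JavaScript),
/// so a word is tokenized as a keyword if it is reserved in any of them.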
fn get_keyword_set() -> &'static [&'static str] {
    &[
        // Rust
        "as",
        "break",
        "const",
        "continue",
        "crate",
        "else",
        "enum",
        "extern",
        "false",
        "fn",
        "for",
        "if",
        "impl",
        "in",
        "let",
        "loop",
        "match",
        "mod",
        "move",
        "mut",
        "pub",
        "ref",
        "return",
        "self",
        "Self",
        "static",
        "struct",
        "super",
        "trait",
        "true",
        "type",
        "unsafe",
        "use",
        "where",
        "while",
        "async",
        "await",
        "dyn",
        // Python
        "and",
        "assert",
        "class",
        "def",
        "del",
        "elif",
        "except",
        "finally",
        "from",
        "global",
        "import",
        "is",
        "lambda",
        "nonlocal",
        "not",
        "or",
        "pass",
        "raise",
        "try",
        "with",
        "yield",
        "await",
        // JavaScript
        "case",
        "catch",
        "class",
        "const",
        "continue",
        "debugger",
        "default",
        "delete",
        "do",
        "else",
        "export",
        "extends",
        "finally",
        "for",
        "function",
        "if",
        "import",
        "in",
        "instanceof",
        "let",
        "new",
        "return",
        "super",
        "switch",
        "this",
        "throw",
        "try",
        "typeof",
        "var",
        "void",
        "while",
        "with",
        "yield",
    ]
}

/// Returns true if `s` is a recognized one- or two-character operator.
fn is_operator(s: &str) -> bool {
    matches!(
        s,
        "+" | "-"
            | "*"
            | "/"
            | "%"
            | "="
            | "=="
            | "!="
            | "<"
            | ">"
            | "<="
            | ">="
            | "&&"
            | "||"
            | "!"
            | "&"
            | "|"
            | "^"
            | "<<"
            | ">>"
            | "+="
            | "-="
            | "*="
            | "/="
            | "=>"
            | "->"
            | "::"
            | "."
    )
}

/// Returns true if `ch` is a recognized punctuation character.
fn is_punctuation(ch: char) -> bool {
    matches!(
        ch,
        '(' | ')' | '{' | '}' | '[' | ']' | ';' | ':' | ',' | '?'
    )
}

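/// Computes a similarity score in `[0.0, 1.0]` between two token sequences,
/// defined as `1 - edit_distance / max(len1, len2)`. Two empty sequences are
/// considered identical.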
pub fn compute_token_similarity(tokens1: &[Token], tokens2: &[Token]) -> f64 {
    if tokens1.is_empty() && tokens2.is_empty() {
        return 1.0;
    }
    if tokens1.is_empty() || tokens2.is_empty() {
        return 0.0;
    }

    let distance = compute_token_edit_distance(tokens1, tokens2);
    let max_len = tokens1.len().max(tokens2.len());

    let similarity = 1.0 - (distance as f64 / max_len as f64);
    similarity.clamp(0.0, 1.0)
}

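/// Computes the Levenshtein edit distance between two token sequences using
/// the two-row dynamic programming formulation: O(len1 * len2) time and
/// O(len2) memory instead of the full distance matrix.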
pub fn compute_token_edit_distance(tokens1: &[Token], tokens2: &[Token]) -> usize {
    let len1 = tokens1.len();
    let len2 = tokens2.len();

    if len1 == 0 {
        return len2;
    }
    if len2 == 0 {
        return len1;
    }

    let mut prev_row: Vec<usize> = (0..=len2).collect();
    let mut curr_row: Vec<usize> = vec![0; len2 + 1];

    for i in 1..=len1 {
        curr_row[0] = i;

        for j in 1..=len2 {
            let cost = if tokens1[i - 1] == tokens2[j - 1] {
                0
            } else {
                1
            };

            // Minimum of substitution, deletion, and insertion.
            curr_row[j] = (prev_row[j - 1] + cost)
                .min(prev_row[j] + 1)
                .min(curr_row[j - 1] + 1);
        }

        std::mem::swap(&mut prev_row, &mut curr_row);
    }

    prev_row[len2]
}

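/// For a candidate window pair starting at `i` in `tokens1` and `j` in
/// `tokens2`, tries extending either side by up to `length_delta_limit` extra
/// tokens and returns the `(source_len, target_len, similarity)` combination
/// that satisfies `tolerance`, preferring higher similarity and then larger
/// coverage. Returns `None` if no variant reaches the tolerance.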
#[allow(clippy::too_many_arguments)]
fn explore_length_variants(
    tokens1: &[Token],
    tokens2: &[Token],
    i: usize,
    j: usize,
    window_size: usize,
    base_similarity: f64,
    length_delta_limit: usize,
    tolerance: f64,
) -> Option<(usize, usize, f64)> {
    let max_extra1 = length_delta_limit.min(tokens1.len().saturating_sub(i + window_size));
    let max_extra2 = length_delta_limit.min(tokens2.len().saturating_sub(j + window_size));

    let mut best_match: Option<(usize, usize, f64)> = None;

    for extra1 in 0..=max_extra1 {
        let len1 = window_size + extra1;
        let window1 = &tokens1[i..i + len1];

        for extra2 in 0..=max_extra2 {
            let len2 = window_size + extra2;

            if len1.abs_diff(len2) > length_delta_limit {
                continue;
            }

            // The edit distance is at least the length difference, so skip
            // pairs that cannot possibly reach the tolerance.
            let max_len = len1.max(len2);
            let min_distance = len1.abs_diff(len2);
            let max_allowed_distance = ((1.0 - tolerance) * max_len as f64).ceil() as usize;

            if min_distance > max_allowed_distance {
                continue;
            }

            // The base window pair was already scored by the caller.
            let candidate_similarity = if extra1 == 0 && extra2 == 0 {
                base_similarity
            } else {
                compute_token_similarity(window1, &tokens2[j..j + len2])
            };

            if candidate_similarity >= tolerance {
                best_match = match best_match {
                    None => Some((len1, len2, candidate_similarity)),
                    Some((best_len1, best_len2, best_sim)) => {
                        let better_similarity = candidate_similarity > best_sim + f64::EPSILON;
                        let better_coverage =
                            !better_similarity && len1.max(len2) > best_len1.max(best_len2);

                        if better_similarity || better_coverage {
                            Some((len1, len2, candidate_similarity))
                        } else {
                            best_match
                        }
                    }
                };
            }
        }
    }

    best_match
}

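/// Detects near-miss (Type-3) clones between two token streams: regions that
/// are similar above `tolerance` but may differ by insertions, deletions, or
/// substitutions. Sliding windows of `window_size` tokens are filtered with a
/// cheap similarity floor, then `explore_length_variants` searches for the
/// best matching lengths. Streams whose overall lengths differ by more than
/// 20% are skipped.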
pub fn detect_type3_clones(
    tokens1: &[Token],
    tokens2: &[Token],
    window_size: usize,
    tolerance: f64,
) -> Vec<CloneMatch> {
    let mut matches = Vec::new();

    // Skip pairs whose overall lengths are too different to contain
    // comparable clones.
    let len_ratio = tokens1.len() as f64 / tokens2.len().max(1) as f64;
    if !(0.8..=1.2).contains(&len_ratio) {
        return matches;
    }
    if tokens1.len() < window_size || tokens2.len() < window_size || window_size == 0 {
        return matches;
    }

    // How far window lengths may diverge, derived from the tolerance but
    // capped to keep the search bounded.
    let base_delta = ((1.0 - tolerance) * window_size as f64).ceil() as usize;
    let length_delta_limit = (base_delta + 2).max(1).min(window_size).min(20);
    // Cheap pre-filter: windows below this similarity are not explored further.
    let similarity_floor = (tolerance - 0.3).max(0.0);

    let mut i = 0;
    while i + window_size <= tokens1.len() {
        let mut j = 0;
        while j + window_size <= tokens2.len() {
            let window1 = &tokens1[i..i + window_size];
            let window2 = &tokens2[j..j + window_size];
            let similarity = compute_token_similarity(window1, window2);

            if similarity < similarity_floor {
                j += 1;
                continue;
            }

            let best_match = explore_length_variants(
                tokens1,
                tokens2,
                i,
                j,
                window_size,
                similarity,
                length_delta_limit,
                tolerance,
            );

            if let Some((source_len, target_len, final_similarity)) = best_match {
                matches.push(CloneMatch {
                    source_start: i,
                    target_start: j,
                    length: source_len,
                    target_length: target_len,
                    similarity: final_similarity,
                });

                // Jump past the matched region in the target stream.
                let skip = target_len.saturating_sub(window_size).max(1);
                j += skip;
            } else {
                j += 1;
            }
        }
        i += 1;
    }

    matches
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_rust_code() {
        let code = r#"
        fn add(x: i32, y: i32) -> i32 {
            x + y
        }
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "fn")));

        assert!(tokens.contains(&Token::Identifier));
    }

    #[test]
    fn test_normalize_python_code() {
        let code = r#"
        def greet(name):
            return f"Hello, {name}!"
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "def")));
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));

        assert!(tokens.contains(&Token::StringLiteral));
    }

    #[test]
    fn test_normalize_javascript_code() {
        let code = r#"
        function multiply(a, b) {
            return a * b;
        }
        "#;

        let tokens = normalize(code);
        assert!(!tokens.is_empty());

        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "function")));
        assert!(tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "return")));
    }

    #[test]
    fn test_normalize_ignores_comments() {
        let code = r#"
        // This is a comment
        fn test() {
            /* Multi-line
               comment */
            let x = 5; // inline comment
        }
        "#;

        let tokens = normalize(code);

        // The only slashes and the word "is" appear inside comments, so
        // neither should survive normalization.
        assert!(!tokens
            .iter()
            .any(|t| matches!(t, Token::Operator(op) if op == "/")));
        assert!(!tokens
            .iter()
            .any(|t| matches!(t, Token::Keyword(k) if k == "is")));
    }

    #[test]
    fn test_rolling_hash_creation() {
        let hasher = RollingHash::new(50);
        assert_eq!(hasher.window_size(), 50);
        assert_eq!(hasher.current_hash(), None);
    }

    #[test]
    fn test_rolling_hash_basic() {
        let mut hasher = RollingHash::new(3);

        // A hash is only reported once the window is full.
        assert_eq!(hasher.roll(1), None);
        assert_eq!(hasher.roll(2), None);
        let hash1 = hasher.roll(3);
        assert!(hash1.is_some());

        let hash2 = hasher.roll(4);
        assert!(hash2.is_some());

        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_compute_rolling_hashes() {
        let tokens = vec![
            Token::Keyword("fn".to_string()),
            Token::Identifier,
            Token::Punctuation("(".to_string()),
            Token::Identifier,
            Token::Punctuation(")".to_string()),
        ];

        let hashes = compute_rolling_hashes(&tokens, 3);
        // 5 tokens with a window of 3 yields 5 - 3 + 1 = 3 windows.
        assert_eq!(hashes.len(), 3);
    }

    #[test]
    fn test_hash_token_consistency() {
        let token1 = Token::Identifier;
        let token2 = Token::Identifier;

        assert_eq!(hash_token(&token1), hash_token(&token2));
    }

    #[test]
    fn test_token_as_hash_string() {
        assert_eq!(Token::Identifier.as_hash_string(), "$$ID");
        assert_eq!(Token::StringLiteral.as_hash_string(), "$$STR");
        assert_eq!(Token::NumberLiteral.as_hash_string(), "$$NUM");
        assert_eq!(Token::Keyword("fn".to_string()).as_hash_string(), "fn");
    }
}