use std::num::Wrapping;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
Keyword(String),
Identifier,
StringLiteral,
NumberLiteral,
Operator(String),
Punctuation(String),
}
impl Token {
pub fn as_hash_string(&self) -> &str {
match self {
Token::Keyword(kw) => kw.as_str(),
Token::Identifier => "$$ID",
Token::StringLiteral => "$$STR",
Token::NumberLiteral => "$$NUM",
Token::Operator(op) => op.as_str(),
Token::Punctuation(p) => p.as_str(),
}
}
}
pub fn normalize(code: &str) -> Vec<Token> {
let keywords = get_keyword_set();
let mut tokens = Vec::new();
let chars: Vec<char> = code.chars().collect();
let mut i = 0;
while i < chars.len() {
let ch = chars[i];
if ch.is_whitespace() {
i += 1;
continue;
}
if (ch == '/' && i + 1 < chars.len() && chars[i + 1] == '/')
|| ch == '#'
{
while i < chars.len() && chars[i] != '\n' {
i += 1;
}
continue;
}
if ch == '/' && i + 1 < chars.len() && chars[i + 1] == '*' {
i += 2;
while i + 1 < chars.len() {
if chars[i] == '*' && chars[i + 1] == '/' {
i += 2;
break;
}
i += 1;
}
continue;
}
if ch == '"' || ch == '\'' {
let quote = ch;
i += 1;
while i < chars.len() {
if chars[i] == '\\' {
i += 2; continue;
}
if chars[i] == quote {
i += 1;
break;
}
i += 1;
}
tokens.push(Token::StringLiteral);
continue;
}
if ch.is_ascii_digit() {
while i < chars.len() && (chars[i].is_ascii_alphanumeric() || chars[i] == '.') {
i += 1;
}
tokens.push(Token::NumberLiteral);
continue;
}
if ch.is_alphabetic() || ch == '_' || ch == '$' {
let start = i;
while i < chars.len() && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '$') {
i += 1;
}
let word: String = chars[start..i].iter().collect();
if keywords.contains(&word.as_str()) {
tokens.push(Token::Keyword(word));
} else {
tokens.push(Token::Identifier);
}
continue;
}
if i + 1 < chars.len() {
let two_char: String = chars[i..i + 2].iter().collect();
if is_operator(&two_char) {
tokens.push(Token::Operator(two_char));
i += 2;
continue;
}
}
let single = ch.to_string();
if is_operator(&single) {
tokens.push(Token::Operator(single));
} else if is_punctuation(ch) {
tokens.push(Token::Punctuation(single));
}
i += 1;
}
tokens
}
#[derive(Debug, Clone)]
pub struct RollingHash {
window_size: usize,
base: Wrapping<u64>,
hash: Wrapping<u64>,
base_power: Wrapping<u64>,
window: Vec<u64>,
}
impl RollingHash {
pub fn new(window_size: usize) -> Self {
let base = Wrapping(257u64);
let mut base_power = Wrapping(1u64);
for _ in 0..window_size {
base_power *= base;
}
Self {
window_size,
base,
hash: Wrapping(0),
base_power,
window: Vec::with_capacity(window_size),
}
}
pub fn roll(&mut self, token_hash: u64) -> Option<u64> {
if self.window.len() < self.window_size {
self.window.push(token_hash);
self.hash = self.hash * self.base + Wrapping(token_hash);
if self.window.len() == self.window_size {
Some(self.hash.0)
} else {
None
}
} else {
let old_token = self.window.remove(0);
self.window.push(token_hash);
self.hash = self.hash - Wrapping(old_token) * self.base_power;
self.hash = self.hash * self.base + Wrapping(token_hash);
Some(self.hash.0)
}
}
pub fn reset(&mut self) {
self.hash = Wrapping(0);
self.window.clear();
}
pub fn current_hash(&self) -> Option<u64> {
if self.window.len() == self.window_size {
Some(self.hash.0)
} else {
None
}
}
pub fn window_size(&self) -> usize {
self.window_size
}
}
pub fn compute_rolling_hashes(tokens: &[Token], window_size: usize) -> Vec<(u64, usize)> {
if tokens.len() < window_size {
return Vec::new();
}
let mut hasher = RollingHash::new(window_size);
let mut hashes = Vec::new();
for (idx, token) in tokens.iter().enumerate() {
let token_hash = hash_token(token);
if let Some(hash) = hasher.roll(token_hash) {
let start_index = idx.saturating_sub(window_size - 1);
hashes.push((hash, start_index));
}
}
hashes
}
fn hash_token(token: &Token) -> u64 {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let mut hasher = DefaultHasher::new();
token.as_hash_string().hash(&mut hasher);
hasher.finish()
}
#[derive(Debug, Clone)]
pub struct CloneMatch {
pub source_start: usize,
pub target_start: usize,
pub length: usize,
}
pub fn detect_duplicates_with_extension(tokens: &[Token], window_size: usize) -> Vec<CloneMatch> {
use std::collections::HashMap;
if tokens.len() < window_size {
return Vec::new();
}
let mut matches = Vec::new();
let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
let mut i = 0;
while i <= tokens.len().saturating_sub(window_size) {
let current_hash = compute_window_hash(&tokens[i..i + window_size]);
if let Some(prev_indices) = hash_map.get(¤t_hash) {
for &prev_index in prev_indices.iter() {
if prev_index >= i {
continue;
}
if verify_window_match(tokens, prev_index, i, window_size) {
let mut extension = 0;
while (i + window_size + extension < tokens.len())
&& (prev_index + window_size + extension < i)
&& (tokens[prev_index + window_size + extension]
== tokens[i + window_size + extension])
{
extension += 1;
}
let total_length = window_size + extension;
matches.push(CloneMatch {
source_start: prev_index,
target_start: i,
length: total_length,
});
i += extension.max(1);
break; }
}
}
hash_map.entry(current_hash).or_insert_with(Vec::new).push(i);
i += 1;
}
matches
}
fn compute_window_hash(window: &[Token]) -> u64 {
const BASE: u64 = 257;
const MODULUS: u64 = 1_000_000_007;
let mut hash: u64 = 0;
for token in window {
let token_hash = hash_token(token);
hash = (hash.wrapping_mul(BASE).wrapping_add(token_hash)) % MODULUS;
}
hash
}
fn verify_window_match(tokens: &[Token], idx_a: usize, idx_b: usize, len: usize) -> bool {
if idx_a + len > tokens.len() || idx_b + len > tokens.len() {
return false;
}
tokens[idx_a..idx_a + len] == tokens[idx_b..idx_b + len]
}
fn get_keyword_set() -> &'static [&'static str] {
&[
"as", "break", "const", "continue", "crate", "else", "enum", "extern",
"false", "fn", "for", "if", "impl", "in", "let", "loop", "match", "mod",
"move", "mut", "pub", "ref", "return", "self", "Self", "static", "struct",
"super", "trait", "true", "type", "unsafe", "use", "where", "while",
"async", "await", "dyn",
"and", "assert", "class", "def", "del", "elif", "except", "finally",
"from", "global", "import", "is", "lambda", "nonlocal", "not", "or",
"pass", "raise", "try", "with", "yield",
"await", "case", "catch", "class", "const", "continue", "debugger",
"default", "delete", "do", "else", "export", "extends", "finally",
"for", "function", "if", "import", "in", "instanceof", "let", "new",
"return", "super", "switch", "this", "throw", "try", "typeof", "var",
"void", "while", "with", "yield",
]
}
fn is_operator(s: &str) -> bool {
matches!(
s,
"+" | "-" | "*" | "/" | "%" | "=" | "==" | "!=" | "<" | ">" | "<=" | ">="
| "&&" | "||" | "!" | "&" | "|" | "^" | "<<" | ">>" | "+=" | "-="
| "*=" | "/=" | "=>" | "->" | "::" | "."
)
}
fn is_punctuation(ch: char) -> bool {
matches!(
ch,
'(' | ')' | '{' | '}' | '[' | ']' | ';' | ':' | ',' | '?'
)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_normalize_rust_code() {
let code = r#"
fn add(x: i32, y: i32) -> i32 {
x + y
}
"#;
let tokens = normalize(code);
assert!(tokens.len() > 0);
assert!(tokens.iter().any(|t| matches!(t, Token::Keyword(k) if k == "fn")));
assert!(tokens.contains(&Token::Identifier));
}
#[test]
fn test_normalize_python_code() {
let code = r#"
def greet(name):
return f"Hello, {name}!"
"#;
let tokens = normalize(code);
assert!(tokens.len() > 0);
assert!(tokens.iter().any(|t| matches!(t, Token::Keyword(k) if k == "def")));
assert!(tokens.iter().any(|t| matches!(t, Token::Keyword(k) if k == "return")));
assert!(tokens.contains(&Token::StringLiteral));
}
#[test]
fn test_normalize_javascript_code() {
let code = r#"
function multiply(a, b) {
return a * b;
}
"#;
let tokens = normalize(code);
assert!(tokens.len() > 0);
assert!(tokens.iter().any(|t| matches!(t, Token::Keyword(k) if k == "function")));
assert!(tokens.iter().any(|t| matches!(t, Token::Keyword(k) if k == "return")));
}
#[test]
fn test_normalize_ignores_comments() {
let code = r#"
// This is a comment
fn test() {
/* Multi-line
comment */
let x = 5; // inline comment
}
"#;
let tokens = normalize(code);
for token in &tokens {
if let Token::Identifier = token {
} else if let Token::Keyword(_) = token {
}
}
}
#[test]
fn test_rolling_hash_creation() {
let hasher = RollingHash::new(50);
assert_eq!(hasher.window_size(), 50);
assert_eq!(hasher.current_hash(), None);
}
#[test]
fn test_rolling_hash_basic() {
let mut hasher = RollingHash::new(3);
assert_eq!(hasher.roll(1), None); assert_eq!(hasher.roll(2), None);
let hash1 = hasher.roll(3); assert!(hash1.is_some());
let hash2 = hasher.roll(4); assert!(hash2.is_some());
assert_ne!(hash1, hash2);
}
#[test]
fn test_compute_rolling_hashes() {
let tokens = vec![
Token::Keyword("fn".to_string()),
Token::Identifier,
Token::Punctuation("(".to_string()),
Token::Identifier,
Token::Punctuation(")".to_string()),
];
let hashes = compute_rolling_hashes(&tokens, 3);
assert_eq!(hashes.len(), 3); }
#[test]
fn test_hash_token_consistency() {
let token1 = Token::Identifier;
let token2 = Token::Identifier;
assert_eq!(hash_token(&token1), hash_token(&token2));
}
#[test]
fn test_token_as_hash_string() {
assert_eq!(Token::Identifier.as_hash_string(), "$$ID");
assert_eq!(Token::StringLiteral.as_hash_string(), "$$STR");
assert_eq!(Token::NumberLiteral.as_hash_string(), "$$NUM");
assert_eq!(Token::Keyword("fn".to_string()).as_hash_string(), "fn");
}
}