use regex::Regex;
use std::collections::HashMap;
/// Produces a comment-free, whitespace-normalised form of `code`.
///
/// The stages run in a fixed order: single-line comments are removed, then
/// multi-line comments, then whitespace is normalised, and finally every
/// line that is empty after trimming is dropped. The surviving lines are
/// joined with `\n`, so the result never ends with a newline.
pub fn normalize_code(code: &str) -> String {
    let without_line_comments = remove_single_line_comments(code);
    let without_comments = remove_multi_line_comments(&without_line_comments);
    let compact = normalize_whitespace(&without_comments);

    // Comment removal leaves empty lines behind; keep only lines with content.
    let mut kept: Vec<&str> = Vec::new();
    for line in compact.lines() {
        if !line.trim().is_empty() {
            kept.push(line);
        }
    }
    kept.join("\n")
}
/// Normalises `code` and additionally abstracts away names and literals.
///
/// Runs [`normalize_code`] first, then replaces identifiers with positional
/// placeholders, string literals with `"STR"` and numeric literals with `NUM`,
/// in that order.
///
/// Note: identifiers are renamed *before* string literals are replaced, so
/// words inside a string literal are renamed as well and take part in the
/// placeholder numbering.
pub fn normalize_with_variables(code: &str) -> String {
    let base = normalize_code(code);
    let renamed = rename_variables(&base);
    let without_strings = normalize_strings(&renamed);
    normalize_numbers(&without_strings)
}
/// Strips single-line comments from every line of `code`.
///
/// Two comment styles are recognised, applied in this order on each line:
///
/// 1. `//` — everything from the first `//` to the end of the line is removed.
/// 2. `#`  — everything from the first `#` that is *not* immediately followed
///    by `!` or `[` is removed, so shebangs (`#!`) and Rust attributes
///    (`#[...]`, `#![...]`) survive.
///
/// The `#` rule is implemented by hand because it needs a look-ahead, which
/// the `regex` crate does not support; a look-ahead pattern fails to compile
/// there and would never be applied.
///
/// Line breaks are preserved exactly (the text is split and re-joined on
/// `\n`). The scan is purely textual: it does not know about string literals,
/// so a `//` or `#` inside a string (for example a URL) is treated as the
/// start of a comment, and C preprocessor lines such as `#include` are
/// removed by the `#` rule.
fn remove_single_line_comments(code: &str) -> String {
    // Byte offset of the first `#` that does not begin `#!` or `#[`.
    // `#` is ASCII, so the offset is always a valid char boundary.
    fn hash_comment_start(line: &str) -> Option<usize> {
        let bytes = line.as_bytes();
        (0..bytes.len()).find(|&i| {
            bytes[i] == b'#' && !matches!(bytes.get(i + 1), Some(&b'!') | Some(&b'['))
        })
    }

    code.split('\n')
        .map(|line| {
            let line = match line.find("//") {
                Some(at) => &line[..at],
                None => line,
            };
            match hash_comment_start(line) {
                Some(at) => &line[..at],
                None => line,
            }
        })
        .collect::<Vec<_>>()
        .join("\n")
}
/// Removes delimited multi-line regions from `code`.
///
/// Three delimiter pairs are handled, one full pass each and in this order:
/// `/* ... */`, `''' ... '''` and `""" ... """`. Within a pass, each region
/// runs from an opening delimiter to the *nearest* following closing
/// delimiter (no nesting), and the whole region including its delimiters is
/// deleted. An opening delimiter with no closing delimiter after it is left
/// untouched.
///
/// This is a plain substring scan, so no pattern is compiled on each call
/// and there is no compile error to ignore.
fn remove_multi_line_comments(code: &str) -> String {
    // (opening, closing) delimiters, in the order they are removed.
    const DELIMITERS: [(&str, &str); 3] = [
        ("/*", "*/"),
        ("'''", "'''"),
        ("\"\"\"", "\"\"\""),
    ];

    // Deletes every `open ... close` region from `text`.
    fn strip_delimited(text: &str, open: &str, close: &str) -> String {
        let mut out = String::with_capacity(text.len());
        let mut rest = text;
        while let Some(start) = rest.find(open) {
            // The closing delimiter must begin after the opening one ends.
            let body = &rest[start + open.len()..];
            let end = match body.find(close) {
                Some(end) => end,
                // No terminator anywhere ahead: nothing further can match.
                None => break,
            };
            out.push_str(&rest[..start]);
            rest = &body[end + close.len()..];
        }
        out.push_str(rest);
        out
    }

    let mut result = code.to_string();
    for &(open, close) in DELIMITERS.iter() {
        result = strip_delimited(&result, open, close);
    }
    result
}
/// Collapses horizontal whitespace and trims every line of `code`.
///
/// Each line is trimmed of leading and trailing whitespace, and every run of
/// spaces and/or tabs inside it is replaced by a single space. Lines are
/// split with [`str::lines`] and re-joined with `\n`, so `\r\n` endings
/// become `\n` and a trailing newline is not preserved. Empty lines are kept.
///
/// Done in one pass over the characters, so no regex is compiled per call.
fn normalize_whitespace(code: &str) -> String {
    code.lines()
        .map(|line| {
            let mut compact = String::with_capacity(line.len());
            let mut in_blank_run = false;
            for ch in line.trim().chars() {
                let blank = ch == ' ' || ch == '\t';
                // Emit only the first character of a space/tab run, as a space.
                if !(blank && in_blank_run) {
                    compact.push(if blank { ' ' } else { ch });
                }
                in_blank_run = blank;
            }
            compact
        })
        .collect::<Vec<_>>()
        .join("\n")
}
fn rename_variables(code: &str) -> String {
let identifier_re = Regex::new(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\b").unwrap();
let mut var_map: HashMap<String, String> = HashMap::new();
let mut counter = 1;
let keywords = [
"fn",
"function",
"def",
"class",
"struct",
"enum",
"trait",
"if",
"else",
"for",
"while",
"loop",
"match",
"switch",
"return",
"break",
"continue",
"let",
"const",
"var",
"pub",
"private",
"public",
"protected",
"static",
"async",
"await",
"yield",
"import",
"from",
"use",
"true",
"false",
"null",
"nil",
"None",
"Some",
];
identifier_re
.replace_all(code, |caps: ®ex::Captures| {
let ident = caps.get(1).unwrap().as_str();
if keywords.contains(&ident) {
return ident.to_string();
}
var_map
.entry(ident.to_string())
.or_insert_with(|| {
let placeholder = format!("v{}", counter);
counter += 1;
placeholder
})
.clone()
})
.to_string()
}
/// Replaces every quoted literal in `code` with the fixed token `"STR"`.
///
/// Both double-quoted and single-quoted literals are matched, and both are
/// replaced by the double-quoted token. A backslash inside a literal escapes
/// the following character, so an escaped quote such as `\"` does not end
/// the literal early. A quote with no matching closing quote is left as-is.
///
/// The match is lexical only: any text between two single quotes is treated
/// as a literal, whatever the surrounding language means by it.
fn normalize_strings(code: &str) -> String {
    // `\\[\s\S]` consumes a backslash plus the character it escapes (including
    // a newline); the negated classes consume every other non-terminating char.
    let string_re = Regex::new(r#""(?:\\[\s\S]|[^"\\])*"|'(?:\\[\s\S]|[^'\\])*'"#)
        .expect("string literal pattern is a valid regex");
    string_re.replace_all(code, "\"STR\"").into_owned()
}
/// Replaces numeric literals in `code` with the fixed token `NUM`.
///
/// A literal is a run of digits, optionally followed by a `.` and further
/// digits, delimited by word boundaries (for example `42` and `3.14`).
/// Digits that touch a letter or underscore, as in `0x1F` or `1_000`, are
/// not at a word boundary and are therefore left unchanged.
fn normalize_numbers(code: &str) -> String {
    let literal_pattern = Regex::new(r"\b\d+\.?\d*\b").unwrap();
    let replaced = literal_pattern.replace_all(code, "NUM");
    replaced.into_owned()
}
/// Returns a 64-bit hash of `code`, computed with the standard library's
/// [`DefaultHasher`](std::collections::hash_map::DefaultHasher).
///
/// Equal inputs give equal hashes within one build of the program.
/// NOTE(review): the standard library does not specify `DefaultHasher`'s
/// algorithm, so values may differ between Rust releases — confirm these
/// hashes are not persisted or compared across builds.
pub fn calculate_hash(code: &str) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = DefaultHasher::new();
    Hash::hash(code, &mut state);
    Hasher::finish(&state)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Both comment styles are removed, lines are trimmed and blank lines
    /// are dropped.
    #[test]
    fn test_normalize_code() {
        let code = r#"
            // This is a comment
            fn test() {
                let x = 5; // inline comment
                /* multi
                   line */
                let y = 10;
            }
        "#;
        let normalized = normalize_code(code);
        assert!(!normalized.contains("//"));
        assert!(!normalized.contains("/*"));
        assert!(!normalized.contains("*/"));
        // Pin the exact output, not just the absence of comment markers.
        assert_eq!(normalized, "fn test() {\nlet x = 5;\nlet y = 10;\n}");
    }

    /// Runs of spaces/tabs collapse to one space and each line is trimmed.
    #[test]
    fn test_normalize_whitespace() {
        // Already-normalised input is returned unchanged.
        assert_eq!(normalize_whitespace("fn test() { }"), "fn test() { }");
        // Redundant whitespace must actually be collapsed and trimmed.
        assert_eq!(normalize_whitespace("fn   test()  {\t}  "), "fn test() { }");
        assert_eq!(normalize_whitespace("  a  \n\tb"), "a\nb");
    }

    /// Identifiers get positional placeholders; keywords are preserved.
    #[test]
    fn test_rename_variables() {
        let code = "let user = 5; let data = user + 10;";
        let normalized = rename_variables(code);
        assert!(normalized.contains("v1"));
        assert!(normalized.contains("v2"));
        assert!(!normalized.contains("user"));
        assert!(!normalized.contains("data"));
        // The repeated `user` must reuse `v1`, and `let` must be left alone.
        assert_eq!(normalized, "let v1 = 5; let v2 = v1 + 10;");
    }

    /// Double- and single-quoted literals both become `"STR"`.
    #[test]
    fn test_normalize_strings() {
        let code = r#"let msg = "hello world"; let x = 'test';"#;
        let normalized = normalize_strings(code);
        assert!(normalized.contains("\"STR\""));
        assert!(!normalized.contains("hello"));
        assert_eq!(normalized, r#"let msg = "STR"; let x = "STR";"#);
    }

    /// Integer and decimal literals both become `NUM`.
    #[test]
    fn test_normalize_numbers() {
        let code = "let x = 42; let y = 3.14;";
        let normalized = normalize_numbers(code);
        assert!(normalized.contains("NUM"));
        assert!(!normalized.contains("42"));
        assert!(!normalized.contains("3.14"));
        assert_eq!(normalized, "let x = NUM; let y = NUM;");
    }
}