use std::path::{Path, PathBuf};
use std::sync::Mutex;
use aho_corasick::AhoCorasick;
use fxhash::FxHashMap;
use once_cell::sync::Lazy;
use xxhash_rust::xxh3::xxh3_64;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use tracing::{debug, trace};
use crate::callgraph::scanner::{ProjectScanner, ScanConfig};
use crate::error::{Result, BrrrError};
use crate::lang::LanguageRegistry;
/// Category of a detected clone.
///
/// Serialized in `snake_case` (see the `serde` attribute below). Only
/// [`CloneType::Type1`] is constructed by the code in this file; the other
/// variants carry `#[allow(dead_code)]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CloneType {
/// Exact clone; displayed as "Type-1 (exact)".
Type1,
/// Renamed clone; displayed as "Type-2 (renamed)".
#[allow(dead_code)]
Type2,
/// Near-miss clone; displayed as "Type-3 (near-miss)".
#[allow(dead_code)]
Type3,
}
impl Default for CloneType {
fn default() -> Self {
Self::Type1
}
}
impl std::fmt::Display for CloneType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Type1 => write!(f, "Type-1 (exact)"),
Self::Type2 => write!(f, "Type-2 (renamed)"),
Self::Type3 => write!(f, "Type-3 (near-miss)"),
}
}
}
/// One occurrence of a duplicated block inside a file.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct CloneInstance {
/// File that contains this occurrence.
pub file: PathBuf,
/// First line of the occurrence (the detector in this file emits 1-based numbers).
pub start_line: usize,
/// Last line of the occurrence, inclusive (see `line_count`).
pub end_line: usize,
/// Optional short excerpt of the block; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub preview: Option<String>,
}
impl CloneInstance {
#[must_use]
pub fn new(file: PathBuf, start_line: usize, end_line: usize) -> Self {
Self {
file,
start_line,
end_line,
preview: None,
}
}
#[must_use]
pub fn with_preview(file: PathBuf, start_line: usize, end_line: usize, preview: String) -> Self {
Self {
file,
start_line,
end_line,
preview: Some(preview),
}
}
#[must_use]
pub fn line_count(&self) -> usize {
self.end_line.saturating_sub(self.start_line) + 1
}
}
/// A group of code locations that were found to be duplicates of each other.
///
/// Note: this struct shares its name with the standard `Clone` trait; the
/// derive below still refers to the trait.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Clone {
/// Category of the clone group.
pub clone_type: CloneType,
/// Every location at which the duplicated block occurs.
pub instances: Vec<CloneInstance>,
/// Number of lines in one occurrence of the block.
pub line_count: usize,
/// Similarity score; `new_type1` sets it to `1.0`.
pub similarity: f64,
/// Fingerprint the group was keyed on, if recorded; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub fingerprint: Option<u64>,
}
impl Clone {
    /// Creates an exact (Type-1) clone group with similarity `1.0` and no fingerprint.
    #[must_use]
    pub fn new_type1(instances: Vec<CloneInstance>, line_count: usize) -> Self {
        Self {
            clone_type: CloneType::Type1,
            similarity: 1.0,
            fingerprint: None,
            instances,
            line_count,
        }
    }

    /// Returns the group with `fingerprint` recorded on it.
    #[must_use]
    pub fn with_fingerprint(self, fingerprint: u64) -> Self {
        Self {
            fingerprint: Some(fingerprint),
            ..self
        }
    }

    /// Lines that are redundant copies: every instance beyond the first
    /// contributes `line_count` lines. Groups with fewer than two instances
    /// contribute nothing.
    #[must_use]
    pub fn duplicated_lines(&self) -> usize {
        match self.instances.len() {
            0 | 1 => 0,
            copies => self.line_count * (copies - 1),
        }
    }
}
/// Tunable settings for clone detection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CloneConfig {
/// Minimum block length in (normalized) lines; used as the sliding-window size.
pub min_lines: usize,
/// Minimum token count. NOTE(review): not read by the code visible in this file.
pub min_tokens: usize,
/// Patterns to ignore. NOTE(review): not read by the code visible in this file.
pub ignore_patterns: Vec<String>,
/// Skip license-heavy files and windows dominated by license text.
pub exclude_license_headers: bool,
/// Skip files located in test-fixture style directories.
pub exclude_test_fixtures: bool,
/// Skip files that look machine generated (by name or header marker).
pub exclude_generated: bool,
/// Largest file size, in bytes, that `detect` will analyze.
pub max_file_size: u64,
/// Optional language name; restricts the scan and selects comment syntax.
pub language: Option<String>,
}
impl Default for CloneConfig {
fn default() -> Self {
Self {
min_lines: 6,
min_tokens: 50,
ignore_patterns: Vec::new(),
exclude_license_headers: true,
exclude_test_fixtures: true,
exclude_generated: true,
max_file_size: 1024 * 1024, language: None,
}
}
}
impl CloneConfig {
    /// Returns the configuration with `min_lines` replaced.
    #[must_use]
    pub fn with_min_lines(self, min_lines: usize) -> Self {
        Self { min_lines, ..self }
    }

    /// Returns the configuration restricted to the given language.
    #[must_use]
    pub fn with_language(self, lang: impl Into<String>) -> Self {
        Self {
            language: Some(lang.into()),
            ..self
        }
    }

    /// Returns the configuration with `ignore_patterns` replaced.
    #[must_use]
    pub fn with_ignore_patterns(self, patterns: Vec<String>) -> Self {
        Self {
            ignore_patterns: patterns,
            ..self
        }
    }
}
/// Aggregate numbers describing one clone-detection run.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CloneStats {
/// Number of files handed to the detector.
pub files_scanned: usize,
/// Number of distinct files that contain at least one clone instance.
pub files_with_clones: usize,
/// Number of clone groups found.
pub clone_groups: usize,
/// Total number of instances across all clone groups.
pub clone_instances: usize,
/// Sum of `Clone::duplicated_lines` over all groups.
pub duplicated_lines: usize,
/// Total number of source lines in the files that were processed.
pub total_lines: usize,
/// `duplicated_lines / total_lines`, as a percentage (see `calculate_percentage`).
pub duplication_percentage: f64,
/// Files skipped because of their size (name-based reading; confirm where it is populated).
pub files_skipped_size: usize,
/// Files skipped by an exclusion rule (fixture, generated, license-heavy).
pub files_skipped_excluded: usize,
}
impl CloneStats {
    /// Recomputes `duplication_percentage` from `duplicated_lines` and
    /// `total_lines`.
    ///
    /// When `total_lines` is zero the percentage is reset to `0.0` instead of
    /// being left at whatever value it held before, so a reused or
    /// deserialized `CloneStats` can never report a stale figure.
    pub fn calculate_percentage(&mut self) {
        self.duplication_percentage = if self.total_lines == 0 {
            0.0
        } else {
            (self.duplicated_lines as f64 / self.total_lines as f64) * 100.0
        };
    }
}
/// A per-file failure recorded during detection; the run itself continues.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CloneError {
/// File that could not be processed.
pub file: PathBuf,
/// Text of the error that was raised for the file.
pub message: String,
}
/// Complete result of a clone-detection run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CloneAnalysis {
/// Base path the analysis was run against.
pub path: PathBuf,
/// Configuration that produced this result.
pub config: CloneConfig,
/// Detected clone groups, ordered by descending duplicated-line count.
pub clone_groups: Vec<Clone>,
/// Summary statistics for the run.
pub stats: CloneStats,
/// Per-file errors; omitted from serialized output when empty.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub errors: Vec<CloneError>,
}
/// Polynomial rolling hash over a fixed-size window of per-line hashes.
///
/// The fingerprint of a window `h0, h1, …, hk` is
/// `h0·base^k + h1·base^(k-1) + … + hk (mod modulus)`, which lets the window
/// slide by one line in O(1) via [`RabinHasher::pop`] + [`RabinHasher::push`].
struct RabinHasher {
    /// Current fingerprint; always kept in `0..modulus`.
    hash: u64,
    /// Polynomial base.
    base: u64,
    /// Prime modulus (2^61 − 1).
    modulus: u64,
    /// `base^(window_size - 1) mod modulus`: weight of the oldest line in a full window.
    base_power: u64,
    /// Number of lines per window.
    window_size: usize,
}

impl RabinHasher {
    /// Creates a hasher for windows of `window_size` lines.
    fn new(window_size: usize) -> Self {
        let base: u64 = 31;
        let modulus: u64 = (1u64 << 61) - 1;
        // base^(window_size - 1); the range is empty for sizes 0 and 1, giving 1.
        let base_power = (1..window_size).fold(1u64, |acc, _| Self::mul_mod(acc, base, modulus));
        Self {
            hash: 0,
            base,
            modulus,
            base_power,
            window_size,
        }
    }

    /// Computes `a * b mod modulus` exactly.
    ///
    /// Both operands can be up to 61 bits wide, so the product needs more
    /// than 64 bits. A wrapping 64-bit multiply would silently discard the
    /// high bits and break the algebra that `pop` relies on, which is why the
    /// intermediate is widened to `u128`.
    fn mul_mod(a: u64, b: u64, modulus: u64) -> u64 {
        let product = u128::from(a) * u128::from(b);
        // The remainder is strictly below `modulus`, so it always fits in u64.
        (product % u128::from(modulus)) as u64
    }

    /// Appends a line hash as the newest element of the window.
    fn push(&mut self, line_hash: u64) {
        let shifted = Self::mul_mod(self.hash % self.modulus, self.base, self.modulus);
        // Both terms are below 2^61, so their sum cannot overflow u64.
        self.hash = (shifted + line_hash % self.modulus) % self.modulus;
    }

    /// Removes the oldest line hash from a full window.
    ///
    /// Must be called while the window holds exactly `window_size` lines,
    /// i.e. before pushing the line that replaces it.
    fn pop(&mut self, old_line_hash: u64) {
        let remove = Self::mul_mod(old_line_hash % self.modulus, self.base_power, self.modulus);
        // Add the modulus before subtracting so the result never goes negative.
        self.hash = (self.hash % self.modulus + self.modulus - remove) % self.modulus;
    }

    /// Fingerprint of the lines currently in the window.
    #[inline]
    fn fingerprint(&self) -> u64 {
        self.hash
    }

    /// Clears the window so the hasher can be reused.
    fn reset(&mut self) {
        self.hash = 0;
    }
}
/// Hashes one normalized line with XXH3 (64-bit).
#[inline]
fn hash_line(line: &str) -> u64 {
    let bytes = line.as_bytes();
    xxh3_64(bytes)
}
/// Stateful, line-by-line source normalizer.
///
/// Strips comments and collapses whitespace so that lines which differ only
/// in layout or commentary hash identically. Block-comment state is carried
/// from one call of [`LineNormalizer::normalize_line`] to the next, so lines
/// must be fed in file order.
struct LineNormalizer<'a> {
    /// Markers that start a comment running to the end of the line.
    single_line_comments: &'a [&'a str],
    /// `true` while inside a block comment opened on an earlier line.
    in_block_comment: bool,
    /// Terminator of the currently open block comment, if any.
    block_end: Option<&'a str>,
}

impl<'a> LineNormalizer<'a> {
    /// Creates a normalizer using the line-comment markers of `lang`.
    ///
    /// Unknown or missing languages fall back to recognising both `//` and `#`.
    fn for_language(lang: Option<&str>) -> Self {
        let single_line_comments = match lang {
            Some("python") => &["#"][..],
            Some("rust") => &["//"][..],
            Some("go") => &["//"][..],
            Some("java") | Some("c") | Some("cpp") | Some("csharp") | Some("kotlin") => &["//"][..],
            Some("typescript") | Some("javascript") => &["//"][..],
            Some("ruby") => &["#"][..],
            Some("php") => &["//", "#"][..],
            Some("lua") => &["--"][..],
            Some("sql") => &["--"][..],
            Some("shell") | Some("bash") => &["#"][..],
            _ => &["//", "#"][..],
        };
        Self {
            single_line_comments,
            in_block_comment: false,
            block_end: None,
        }
    }

    /// Normalizes one line.
    ///
    /// Returns `None` when nothing but comments/whitespace remains, otherwise
    /// the code portion with runs of whitespace collapsed to single spaces.
    fn normalize_line(&mut self, line: &str) -> Option<String> {
        let mut result = line.to_string();

        // Finish a block comment that was opened on a previous line.
        if let (true, Some(end)) = (self.in_block_comment, self.block_end) {
            match result.find(end) {
                Some(pos) => {
                    result = result[pos + end.len()..].to_string();
                    self.in_block_comment = false;
                    self.block_end = None;
                }
                None => return None,
            }
        }

        result = self.strip_inline_block_comments(&result);
        result = self.handle_block_comment_start(&result);

        for marker in self.single_line_comments {
            // Cut at the first marker that is NOT inside a string literal.
            // Looking only at the very first occurrence would keep the
            // comment in lines such as `let u = "http://x"; // note`.
            let cut = result
                .match_indices(*marker)
                .map(|(pos, _)| pos)
                .find(|&pos| !self.is_in_string_literal(&result, pos));
            if let Some(pos) = cut {
                result.truncate(pos);
            }
        }

        let normalized = result.split_whitespace().collect::<Vec<_>>().join(" ");
        if normalized.is_empty() {
            None
        } else {
            Some(normalized)
        }
    }

    /// Removes block comments that open and close on the same line
    /// (`/* … */` and triple-quoted `""" … """`).
    fn strip_inline_block_comments(&self, line: &str) -> String {
        let without_c_style = Self::strip_delimited(line, "/*", "*/");
        Self::strip_delimited(&without_c_style, "\"\"\"", "\"\"\"")
    }

    /// Removes every complete `open … close` span from `line`.
    ///
    /// The closing delimiter is searched for after the opening one, so `/*/`
    /// is not mistaken for a complete comment. An unterminated opener and
    /// everything after it are left untouched.
    fn strip_delimited(line: &str, open: &str, close: &str) -> String {
        let mut kept = String::with_capacity(line.len());
        let mut rest = line;
        while let Some(start) = rest.find(open) {
            let body_start = start + open.len();
            match rest[body_start..].find(close) {
                Some(body_len) => {
                    kept.push_str(&rest[..start]);
                    rest = &rest[body_start + body_len + close.len()..];
                }
                None => break,
            }
        }
        kept.push_str(rest);
        kept
    }

    /// Detects a block comment that opens on this line without closing,
    /// records the pending terminator, and returns the text before it.
    fn handle_block_comment_start(&mut self, line: &str) -> String {
        let mut result = line.to_string();
        for &(open, close) in &[("/*", "*/"), ("\"\"\"", "\"\"\"")] {
            if let Some(start) = result.find(open) {
                if !result[start + open.len()..].contains(close) {
                    self.in_block_comment = true;
                    self.block_end = Some(close);
                    result.truncate(start);
                }
            }
        }
        result
    }

    /// Reports whether byte offset `pos` of `line` lies inside a single- or
    /// double-quoted literal.
    ///
    /// The prefix is scanned once while tracking which quote (if any) is
    /// open. Inside a literal a backslash escapes the following byte, so
    /// `\"` does not close the literal while `\\"` does, and a quote of the
    /// other kind (`"don't"`) is ordinary content. `pos` values past the end
    /// of the line are clamped.
    fn is_in_string_literal(&self, line: &str, pos: usize) -> bool {
        let bytes = line.as_bytes();
        let prefix = &bytes[..pos.min(bytes.len())];

        let mut open_quote: Option<u8> = None;
        let mut escaped = false;
        for &byte in prefix {
            match open_quote {
                Some(_) if escaped => escaped = false,
                Some(_) if byte == b'\\' => escaped = true,
                Some(quote) if byte == quote => open_quote = None,
                Some(_) => {}
                None if byte == b'\'' || byte == b'"' => open_quote = Some(byte),
                None => {}
            }
        }
        open_quote.is_some()
    }
}
/// Case-insensitive substrings that mark a line as license/copyright text.
///
/// Matching is plain substring search, so every entry must be specific
/// enough not to occur inside ordinary identifiers. The bare abbreviations
/// `mpl`, `gpl` and `lgpl` are deliberately absent: `mpl` is a substring of
/// `impl`, `simple`, `example`, `template`, …, which caused regular code
/// lines to be classified as license text. The abbreviations are matched
/// only in their versioned spellings (`GPL-3.0`, `LGPLv2`, `MPL-2.0`, …);
/// `lgpl-…`/`lgplv…` are covered by the `gpl-`/`gplv` entries.
const LICENSE_PATTERNS: &[&str] = &[
    "copyright",
    "licensed under",
    "license",
    "mit license",
    "apache license",
    "gnu general public",
    "gpl-",
    "gplv",
    "bsd license",
    "mozilla public license",
    "mpl-1",
    "mpl-2",
    "permission is hereby granted",
    "this software is provided",
    "all rights reserved",
    "spdx-license-identifier",
];
/// Lazily built Aho-Corasick automaton over `LICENSE_PATTERNS`, configured for
/// ASCII case-insensitive matching. Construction happens on first use.
static LICENSE_MATCHER: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::builder()
.ascii_case_insensitive(true)
.build(LICENSE_PATTERNS)
.expect("LICENSE_PATTERNS are valid")
});
/// Lower-case header phrases that mark a file as machine generated.
/// `is_generated_file` searches for them in the lower-cased first lines of a file.
const GENERATED_PATTERNS: &[&str] = &[
"generated by",
"auto-generated",
"autogenerated",
"do not edit",
"do not modify",
"generated from",
"@generated",
"this file is generated",
"machine generated",
"automatically generated",
];
#[inline]
fn is_license_content(content: &str) -> bool {
LICENSE_MATCHER.is_match(content)
}
/// Decides whether a file looks machine generated.
///
/// The lower-cased file name is checked first (well-known generator
/// infixes/suffixes); otherwise the first ten entries of `first_lines` are
/// lower-cased, joined with spaces and searched for any of
/// `GENERATED_PATTERNS`.
fn is_generated_file(path: &Path, first_lines: &[String]) -> bool {
    let file_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    let infix_hit = ["generated", ".gen.", "_generated"]
        .iter()
        .any(|infix| file_name.contains(*infix));
    let suffix_hit = [".pb.go", "_pb2.py", ".g.dart"]
        .iter()
        .any(|suffix| file_name.ends_with(*suffix));
    if infix_hit || suffix_hit {
        return true;
    }

    let mut header = String::new();
    for (index, line) in first_lines.iter().take(10).enumerate() {
        if index > 0 {
            header.push(' ');
        }
        header.push_str(&line.to_lowercase());
    }

    GENERATED_PATTERNS
        .iter()
        .any(|pattern| header.contains(*pattern))
}
/// Returns `true` when `path` lies inside a directory conventionally used
/// for test fixtures, test data, samples or snapshots.
///
/// The check is done per path component (case-insensitively) rather than by
/// searching for `"/name/"` in the path string. That way relative paths that
/// *start* with such a directory (`fixtures/data.json`) and paths using the
/// platform's native separator are recognised too. Only directory components
/// are considered: a file that merely happens to be called `fixtures` is not
/// treated as a fixture.
fn is_test_fixture(path: &Path) -> bool {
    const FIXTURE_DIRS: &[&str] = &[
        "fixtures",
        "testdata",
        "test_data",
        "__fixtures__",
        "testfiles",
        "samples",
        "snapshots",
        "__snapshots__",
    ];

    path.parent().map_or(false, |dir| {
        dir.components().any(|component| match component {
            std::path::Component::Normal(name) => {
                let name = name.to_string_lossy().to_lowercase();
                FIXTURE_DIRS.contains(&name.as_str())
            }
            _ => false,
        })
    })
}
/// File-count threshold at or above which `detect_in_files` processes files
/// with rayon's parallel iterator instead of sequentially.
const MIN_FILES_FOR_PARALLEL: usize = 10;
/// Detects textually duplicated blocks by fingerprinting sliding windows of
/// normalized source lines.
pub struct TextualCloneDetector {
// Settings applied to every run of this detector.
config: CloneConfig,
}
impl TextualCloneDetector {
#[must_use]
pub fn new(config: CloneConfig) -> Self {
Self { config }
}
#[must_use]
pub fn with_defaults() -> Self {
Self::new(CloneConfig::default())
}
pub fn detect(&self, path: impl AsRef<Path>) -> Result<CloneAnalysis> {
let path = path.as_ref();
debug!("Starting clone detection in {:?}", path);
let mut scan_config = if let Some(ref lang) = self.config.language {
ScanConfig::for_language(lang)
} else {
ScanConfig::default()
};
scan_config.collect_metadata = true;
let scanner = ProjectScanner::new(path.to_string_lossy().as_ref())?;
let scan_result = scanner.scan_with_config(&scan_config)?;
let files: Vec<PathBuf> = if !scan_result.metadata.is_empty() {
scan_result
.metadata
.into_iter()
.filter(|meta| meta.size <= self.config.max_file_size)
.map(|meta| meta.path)
.collect()
} else {
scan_result
.files
.into_iter()
.filter(|p| {
std::fs::metadata(p)
.map(|m| m.len() <= self.config.max_file_size)
.unwrap_or(false)
})
.collect()
};
debug!("Found {} files to analyze", files.len());
self.detect_in_files(path.to_path_buf(), &files)
}
pub fn detect_in_files(&self, base_path: PathBuf, files: &[PathBuf]) -> Result<CloneAnalysis> {
let mut stats = CloneStats::default();
stats.files_scanned = files.len();
let fingerprints: Mutex<FxHashMap<u64, Vec<CloneInstance>>> = Mutex::new(FxHashMap::default());
let errors: Mutex<Vec<CloneError>> = Mutex::new(Vec::new());
let total_lines: Mutex<usize> = Mutex::new(0);
let files_excluded: Mutex<usize> = Mutex::new(0);
let window_size = self.config.min_lines;
let process_file = |file: &PathBuf| {
match self.process_file(file, window_size) {
Ok(Some((file_fingerprints, line_count))) => {
let mut fp_guard = fingerprints.lock().unwrap_or_else(|e| e.into_inner());
for (hash, instance) in file_fingerprints {
fp_guard.entry(hash).or_default().push(instance);
}
drop(fp_guard);
*total_lines.lock().unwrap_or_else(|e| e.into_inner()) += line_count;
}
Ok(None) => {
*files_excluded.lock().unwrap_or_else(|e| e.into_inner()) += 1;
}
Err(e) => {
errors.lock().unwrap_or_else(|e| e.into_inner()).push(CloneError {
file: file.clone(),
message: e.to_string(),
});
}
}
};
if files.len() >= MIN_FILES_FOR_PARALLEL {
files.par_iter().for_each(process_file);
} else {
files.iter().for_each(process_file);
}
let fingerprints = fingerprints.into_inner().unwrap_or_else(|e| e.into_inner());
let errors = errors.into_inner().unwrap_or_else(|e| e.into_inner());
stats.total_lines = *total_lines.lock().unwrap_or_else(|e| e.into_inner());
stats.files_skipped_excluded = *files_excluded.lock().unwrap_or_else(|e| e.into_inner());
let mut clone_groups = Vec::new();
let mut files_with_clones = std::collections::HashSet::new();
for (hash, instances) in fingerprints {
if instances.len() >= 2 {
for inst in &instances {
files_with_clones.insert(inst.file.clone());
}
let line_count = instances.first().map(|i| i.line_count()).unwrap_or(window_size);
let clone = Clone::new_type1(instances, line_count).with_fingerprint(hash);
clone_groups.push(clone);
}
}
clone_groups.sort_by(|a, b| b.duplicated_lines().cmp(&a.duplicated_lines()));
stats.clone_groups = clone_groups.len();
stats.clone_instances = clone_groups.iter().map(|c| c.instances.len()).sum();
stats.duplicated_lines = clone_groups.iter().map(|c| c.duplicated_lines()).sum();
stats.files_with_clones = files_with_clones.len();
stats.calculate_percentage();
debug!(
"Clone detection complete: {} groups, {} duplicated lines ({:.1}%)",
stats.clone_groups, stats.duplicated_lines, stats.duplication_percentage
);
Ok(CloneAnalysis {
path: base_path,
config: self.config.clone(),
clone_groups,
stats,
errors,
})
}
fn process_file(
&self,
path: &Path,
window_size: usize,
) -> Result<Option<(Vec<(u64, CloneInstance)>, usize)>> {
let content = std::fs::read_to_string(path)
.map_err(|e| BrrrError::io_with_path(e, path))?;
let original_lines: Vec<&str> = content.lines().collect();
if self.config.exclude_test_fixtures && is_test_fixture(path) {
trace!("Skipping test fixture: {:?}", path);
return Ok(None);
}
let lang = self.config.language.as_deref().or_else(|| {
LanguageRegistry::global()
.detect_language(path)
.map(|l| l.name())
});
let mut normalizer = LineNormalizer::for_language(lang);
let mut normalized_lines: Vec<(usize, String)> = Vec::new();
for (idx, line) in original_lines.iter().enumerate() {
if let Some(normalized) = normalizer.normalize_line(line) {
normalized_lines.push((idx + 1, normalized)); }
}
if self.config.exclude_generated {
let first_lines: Vec<String> = original_lines.iter().take(15).map(|s| s.to_string()).collect();
if is_generated_file(path, &first_lines) {
trace!("Skipping generated file: {:?}", path);
return Ok(None);
}
}
if self.config.exclude_license_headers {
let license_lines = normalized_lines.iter()
.take(30)
.filter(|(_, l)| is_license_content(l))
.count();
if license_lines > 15 {
trace!("Skipping license-heavy file: {:?}", path);
return Ok(None);
}
}
if normalized_lines.len() < window_size {
return Ok(Some((Vec::new(), original_lines.len())));
}
let line_hashes: Vec<u64> = normalized_lines.iter()
.map(|(_, l)| hash_line(l))
.collect();
let mut hasher = RabinHasher::new(window_size);
let mut fingerprints: Vec<(u64, CloneInstance)> = Vec::new();
for hash in line_hashes.iter().take(window_size) {
hasher.push(*hash);
}
let start_line = normalized_lines[0].0;
let end_line = normalized_lines[window_size - 1].0;
let preview = normalized_lines[0].1.chars().take(60).collect::<String>();
let block_is_license = self.config.exclude_license_headers &&
normalized_lines[..window_size].iter()
.filter(|(_, l)| is_license_content(l))
.count() > window_size / 2;
if !block_is_license {
fingerprints.push((
hasher.fingerprint(),
CloneInstance::with_preview(path.to_path_buf(), start_line, end_line, preview),
));
}
for i in window_size..normalized_lines.len() {
hasher.pop(line_hashes[i - window_size]);
hasher.push(line_hashes[i]);
let start_line = normalized_lines[i - window_size + 1].0;
let end_line = normalized_lines[i].0;
let preview = normalized_lines[i - window_size + 1].1.chars().take(60).collect::<String>();
let block_is_license = self.config.exclude_license_headers &&
normalized_lines[i - window_size + 1..=i].iter()
.filter(|(_, l)| is_license_content(l))
.count() > window_size / 2;
if !block_is_license {
fingerprints.push((
hasher.fingerprint(),
CloneInstance::with_preview(path.to_path_buf(), start_line, end_line, preview),
));
}
}
Ok(Some((fingerprints, original_lines.len())))
}
}
/// Convenience entry point: runs clone detection on `path` with the default
/// configuration, optionally overriding the minimum block length and the
/// language.
///
/// # Errors
///
/// Propagates any error returned by `TextualCloneDetector::detect`.
pub fn detect_clones(
    path: impl AsRef<Path>,
    min_lines: Option<usize>,
    language: Option<&str>,
) -> Result<CloneAnalysis> {
    let mut config = CloneConfig::default();
    config.min_lines = min_lines.unwrap_or(config.min_lines);
    if let Some(lang) = language.map(str::to_owned) {
        config.language = Some(lang);
    }
    TextualCloneDetector::new(config).detect(path)
}
/// Renders a plain-text report of `analysis`: a statistics header followed by
/// at most the first ten clone groups with the location (and preview, when
/// present) of each instance.
pub fn format_clone_summary(analysis: &CloneAnalysis) -> String {
    let stats = &analysis.stats;
    let header = [
        format!("Clone Detection Results for {}\n", analysis.path.display()),
        format!("{}\n\n", "=".repeat(60)),
        String::from("Statistics:\n"),
        format!(" Files scanned: {}\n", stats.files_scanned),
        format!(" Files with clones: {}\n", stats.files_with_clones),
        format!(" Clone groups: {}\n", stats.clone_groups),
        format!(" Clone instances: {}\n", stats.clone_instances),
        format!(" Total lines: {}\n", stats.total_lines),
        format!(" Duplicated lines: {}\n", stats.duplicated_lines),
        format!(" Duplication: {:.1}%\n\n", stats.duplication_percentage),
    ];
    let mut output: String = header.concat();

    if analysis.clone_groups.is_empty() {
        return output;
    }

    output += "Top Clone Groups:\n";
    for (rank, group) in (1usize..).zip(analysis.clone_groups.iter().take(10)) {
        output += &format!(
            "\n{}. {} instances, {} lines each ({} duplicated lines)\n",
            rank,
            group.instances.len(),
            group.line_count,
            group.duplicated_lines()
        );
        for instance in &group.instances {
            output += &format!(
                " {}:{}-{}\n",
                instance.file.display(),
                instance.start_line,
                instance.end_line
            );
            if let Some(preview) = instance.preview.as_deref() {
                output += &format!(" > {}\n", preview);
            }
        }
    }
    output
}
// Unit tests for the hashing, normalization, exclusion and detection logic
// defined in this module.
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::TempDir;
// Hashing the same three lines twice (with a reset in between) must give
// the same fingerprint.
#[test]
fn test_rabin_hasher_basic() {
let mut hasher = RabinHasher::new(3);
hasher.push(hash_line("line 1"));
hasher.push(hash_line("line 2"));
hasher.push(hash_line("line 3"));
let fp1 = hasher.fingerprint();
hasher.reset();
hasher.push(hash_line("line 1"));
hasher.push(hash_line("line 2"));
hasher.push(hash_line("line 3"));
let fp2 = hasher.fingerprint();
assert_eq!(fp1, fp2, "Same lines should produce same fingerprint");
}
// Sliding a two-line window by pop + push must agree with hashing the new
// window from scratch.
#[test]
fn test_rabin_hasher_sliding() {
let mut hasher = RabinHasher::new(2);
let h1 = hash_line("a");
let h2 = hash_line("b");
let h3 = hash_line("c");
hasher.push(h1);
hasher.push(h2);
let fp_ab = hasher.fingerprint();
hasher.pop(h1);
hasher.push(h3);
let fp_bc = hasher.fingerprint();
hasher.reset();
hasher.push(h2);
hasher.push(h3);
let fp_bc_direct = hasher.fingerprint();
assert_ne!(fp_ab, fp_bc, "Different windows should have different fingerprints");
assert_eq!(fp_bc, fp_bc_direct, "Sliding and direct should match");
}
// Rust line comments are stripped, whitespace is trimmed, and lines with
// no code normalize to `None`.
#[test]
fn test_line_normalizer_comments() {
let mut normalizer = LineNormalizer::for_language(Some("rust"));
assert_eq!(
normalizer.normalize_line("let x = 5; // this is a comment"),
Some("let x = 5;".to_string())
);
assert_eq!(
normalizer.normalize_line(" let y = 10; "),
Some("let y = 10;".to_string())
);
assert_eq!(normalizer.normalize_line("// just a comment"), None);
assert_eq!(normalizer.normalize_line(" "), None);
}
// Python uses `#` as its line-comment marker.
#[test]
fn test_line_normalizer_python() {
let mut normalizer = LineNormalizer::for_language(Some("python"));
assert_eq!(
normalizer.normalize_line("x = 5 # assign x"),
Some("x = 5".to_string())
);
assert_eq!(normalizer.normalize_line("# just a comment"), None);
}
// A block comment that opens and closes on one line is removed in place.
#[test]
fn test_line_normalizer_block_comments() {
let mut normalizer = LineNormalizer::for_language(Some("rust"));
assert_eq!(
normalizer.normalize_line("let x /* comment */ = 5;"),
Some("let x = 5;".to_string())
);
}
// String-literal detection: positions outside quotes, inside double and
// single quotes, after an escaped quote, at offset zero, and on a line
// longer than 32 bytes.
#[test]
fn test_is_in_string_literal_simd() {
let normalizer = LineNormalizer::for_language(Some("rust"));
assert!(!normalizer.is_in_string_literal("let x = 5; // comment", 11));
assert!(normalizer.is_in_string_literal(r#"let s = "hello // world";"#, 15));
assert!(normalizer.is_in_string_literal("let c = '/' + other;", 10));
assert!(!normalizer.is_in_string_literal(r#"let s = "test\""; // comment"#, 20));
assert!(!normalizer.is_in_string_literal("let x = 5;", 0));
let long_line = format!(
r#"let long = "{}"; // comment at position 60"#,
"a".repeat(40)
);
assert!(!normalizer.is_in_string_literal(&long_line, 56));
assert!(normalizer.is_in_string_literal(&long_line, 20));
}
// License markers are matched case-insensitively; plain code is not flagged.
#[test]
fn test_is_license_content() {
assert!(is_license_content("Copyright 2024 Acme Inc"));
assert!(is_license_content("Licensed under the MIT License"));
assert!(is_license_content("SPDX-License-Identifier: Apache-2.0"));
assert!(!is_license_content("fn main() {}"));
assert!(!is_license_content("let x = 5;"));
}
// Paths inside fixture/testdata directories are recognised; regular
// source paths are not.
#[test]
fn test_is_test_fixture() {
assert!(is_test_fixture(Path::new("/project/tests/fixtures/data.json")));
assert!(is_test_fixture(Path::new("/project/__fixtures__/mock.js")));
assert!(is_test_fixture(Path::new("/project/testdata/input.txt")));
assert!(!is_test_fixture(Path::new("/project/src/main.rs")));
}
// Generated files are recognised by file name alone or by header marker.
#[test]
fn test_is_generated_file() {
assert!(is_generated_file(
Path::new("proto.pb.go"),
&["// Code generated by protoc-gen-go. DO NOT EDIT.".to_string()]
));
assert!(is_generated_file(
Path::new("types_generated.rs"),
&[]
));
assert!(!is_generated_file(
Path::new("main.rs"),
&["fn main() {}".to_string()]
));
}
// Two files with identical content must yield at least one clone group.
#[test]
fn test_detect_exact_clones() {
let temp_dir = TempDir::new().unwrap();
let code = r#"
fn process_data(data: &str) -> Result<String> {
let trimmed = data.trim();
let validated = validate(trimmed)?;
let processed = transform(validated);
let result = format(processed);
Ok(result)
}
fn other_code() {
println!("different");
}
"#;
let file1 = temp_dir.path().join("file1.rs");
let file2 = temp_dir.path().join("file2.rs");
fs::write(&file1, code).unwrap();
fs::write(&file2, code).unwrap();
let config = CloneConfig::default().with_min_lines(4);
let detector = TextualCloneDetector::new(config);
let result = detector.detect_in_files(
temp_dir.path().to_path_buf(),
&[file1, file2]
).unwrap();
assert!(result.stats.clone_groups > 0, "Should detect clone groups");
assert!(result.stats.duplicated_lines > 0, "Should have duplicated lines");
}
// Files shorter than the window produce no clone groups.
#[test]
fn test_detect_no_clones() {
let temp_dir = TempDir::new().unwrap();
let file1 = temp_dir.path().join("file1.rs");
let file2 = temp_dir.path().join("file2.rs");
fs::write(&file1, "fn foo() { println!(\"foo\"); }").unwrap();
fs::write(&file2, "fn bar() { println!(\"bar\"); }").unwrap();
let config = CloneConfig::default().with_min_lines(6);
let detector = TextualCloneDetector::new(config);
let result = detector.detect_in_files(
temp_dir.path().to_path_buf(),
&[file1, file2]
).unwrap();
assert_eq!(result.stats.clone_groups, 0, "Should not detect clones in different code");
}
// `line_count` treats the line range as inclusive.
#[test]
fn test_clone_instance_line_count() {
let instance = CloneInstance::new(PathBuf::from("test.rs"), 10, 15);
assert_eq!(instance.line_count(), 6);
let instance2 = CloneInstance::new(PathBuf::from("test.rs"), 5, 5);
assert_eq!(instance2.line_count(), 1);
}
// Three instances of a 6-line block: two of them are redundant copies.
#[test]
fn test_clone_duplicated_lines() {
let clone = Clone::new_type1(
vec![
CloneInstance::new(PathBuf::from("a.rs"), 1, 6),
CloneInstance::new(PathBuf::from("b.rs"), 1, 6),
CloneInstance::new(PathBuf::from("c.rs"), 1, 6),
],
6,
);
assert_eq!(clone.duplicated_lines(), 12);
}
// The builder-style setters each replace the corresponding field.
#[test]
fn test_clone_config_builder() {
let config = CloneConfig::default()
.with_min_lines(10)
.with_language("python")
.with_ignore_patterns(vec!["*.test.py".to_string()]);
assert_eq!(config.min_lines, 10);
assert_eq!(config.language, Some("python".to_string()));
assert_eq!(config.ignore_patterns.len(), 1);
}
// Line hashing is deterministic and distinguishes different lines.
#[test]
fn test_hash_line_consistency() {
let line = "let x = foo(bar, baz);";
let h1 = hash_line(line);
let h2 = hash_line(line);
assert_eq!(h1, h2, "Same line should produce same hash");
let h3 = hash_line("let y = bar(foo);");
assert_ne!(h1, h3, "Different lines should produce different hashes");
}
// The text summary contains the title and the key statistics.
#[test]
fn test_format_clone_summary() {
let analysis = CloneAnalysis {
path: PathBuf::from("./src"),
config: CloneConfig::default(),
clone_groups: vec![Clone::new_type1(
vec![
CloneInstance::with_preview(
PathBuf::from("a.rs"), 1, 6, "fn test()".to_string()
),
CloneInstance::with_preview(
PathBuf::from("b.rs"), 10, 15, "fn test()".to_string()
),
],
6,
)],
stats: CloneStats {
files_scanned: 10,
files_with_clones: 2,
clone_groups: 1,
clone_instances: 2,
duplicated_lines: 6,
total_lines: 100,
duplication_percentage: 6.0,
..Default::default()
},
errors: Vec::new(),
};
let summary = format_clone_summary(&analysis);
assert!(summary.contains("Clone Detection Results"));
assert!(summary.contains("Files scanned: 10"));
assert!(summary.contains("Duplication: 6.0%"));
}
}