/// Tests feeding near-identical inputs through the winnowing fingerprinter
/// and the exact-duplicate detector.
mod hash_collision_tests {
    use super::*;

    /// Every one of several closely related phrases must yield a non-empty
    /// fingerprint.
    #[test]
    fn test_winnowing_same_hash_different_text() {
        let winnower = Winnowing::new(5, 3);
        let samples = [
            "the quick brown fox",
            "the quick brown dog",
            "the quick green fox",
            "the slow brown fox",
            "a quick brown fox",
        ];

        // Fingerprint every sample first, then check the whole batch.
        let mut fingerprints: Vec<Vec<u64>> = Vec::with_capacity(samples.len());
        for sample in &samples {
            fingerprints.push(winnower.fingerprint(sample));
        }

        assert!(fingerprints.iter().all(|fp| !fp.is_empty()));
    }

    /// Smoke test: exact-duplicate detection over two identical files and one
    /// near-identical file must complete without panicking.
    #[test]
    fn test_detector_hash_collision_handling() {
        let config = SimilarityConfig {
            min_lines: 2,
            min_tokens: 3,
            ..SimilarityConfig::default()
        };
        let detector = SimilarityDetector::new(config);

        // Small helper that turns a (name, body) pair into the tuple shape
        // passed to the detector below.
        let source = |name: &str, body: &str| (PathBuf::from(name), body.to_string());
        let files = vec![
            source("a.rs", "let abc = 1;\nlet xyz = 2;\n"),
            source("b.rs", "let abc = 1;\nlet xyz = 2;\n"),
            source("c.rs", "let abc = 1;\nlet uvw = 2;\n"),
        ];

        let duplicates = detector.detect_exact_duplicates(&files);
        // Only the absence of a panic is checked; the count is deliberately unused.
        let _ = duplicates.len();
    }
}
/// Tests covering empty and whitespace-only inputs.
mod empty_input_tests {
    use super::*;

    /// With no files at all, every analysis must report nothing.
    #[test]
    fn test_detector_empty_files_list() {
        let detector = SimilarityDetector::new(SimilarityConfig::default());
        let no_files: Vec<(PathBuf, String)> = Vec::new();

        // Run every analysis up front, then inspect the results.
        let exact = detector.detect_exact_duplicates(&no_files);
        let structural = detector.detect_structural_similarity(&no_files, 0.5);
        let semantic = detector.detect_semantic_similarity(&no_files, 0.5);
        let entropy = detector.analyze_entropy(&no_files);
        let refactoring = detector.find_refactoring_opportunities(&no_files);
        let comprehensive = detector.comprehensive_analysis(&no_files);

        assert!(exact.is_empty());
        assert!(structural.is_empty());
        assert!(semantic.is_empty());
        // Average entropy must be zero within floating-point tolerance.
        assert!(entropy.average_entropy.abs() < f64::EPSILON);
        assert!(refactoring.is_empty());
        assert_eq!(comprehensive.metrics.total_clones, 0);
    }

    /// Smoke test: a single file with an empty body must not cause a panic.
    #[test]
    fn test_detector_file_with_empty_content() {
        let config = SimilarityConfig {
            min_lines: 1,
            min_tokens: 1,
            ..SimilarityConfig::default()
        };
        let detector = SimilarityDetector::new(config);
        let files = vec![(PathBuf::from("empty.rs"), String::new())];

        let _ = detector.detect_exact_duplicates(&files);
        let _ = detector.analyze_entropy(&files);
    }

    /// Fingerprinting the empty string yields no fingerprints.
    #[test]
    fn test_winnowing_empty_string() {
        let winnower = Winnowing::new(5, 3);
        let fingerprint = winnower.fingerprint("");
        assert!(fingerprint.is_empty());
    }

    /// Similarity of two empty fingerprint sets is zero.
    #[test]
    fn test_winnowing_similarity_both_empty() {
        let winnower = Winnowing::new(5, 3);
        let score = winnower.similarity(&[], &[]);
        assert!(score.abs() < f64::EPSILON);
    }

    /// Matching two empty fingerprint sets produces no matches.
    #[test]
    fn test_winnowing_find_matches_empty() {
        let winnower = Winnowing::new(5, 3);
        let found = winnower.find_matches(&[], &[]);
        assert!(found.is_empty());
    }

    /// Smoke test: files containing only whitespace must not cause a panic.
    #[test]
    fn test_detector_files_with_only_whitespace() {
        let config = SimilarityConfig {
            min_lines: 1,
            min_tokens: 1,
            ..SimilarityConfig::default()
        };
        let detector = SimilarityDetector::new(config);

        let files: Vec<(PathBuf, String)> = [("ws1.rs", " \n\t\n \n"), ("ws2.rs", "\n\n\n")]
            .iter()
            .map(|&(name, body)| (PathBuf::from(name), body.to_string()))
            .collect();

        let _ = detector.detect_exact_duplicates(&files);
        let _ = detector.analyze_entropy(&files);
    }
}
/// End-to-end scenarios driving the detector and the winnowing fingerprinter
/// with realistic inputs.
mod integration_tests {
    use super::*;

    /// Runs the comprehensive analysis over two functions that differ only in
    /// identifier names and checks the report's basic invariants.
    #[test]
    fn test_full_workflow_real_rust_code() {
        let config = SimilarityConfig {
            min_lines: 3,
            min_tokens: 10,
            similarity_threshold: 0.7,
            enable_entropy: true,
            enable_ast: true,
            enable_semantic: true,
            window_size: 10,
            k_gram_size: 5,
        };
        let detector = SimilarityDetector::new(config);

        // Raw-string bodies stay flush-left so their bytes are unchanged.
        let average_impl = r#"
fn calculate_average(numbers: &[i32]) -> f64 {
if numbers.is_empty() {
return 0.0;
}
let sum: i32 = numbers.iter().sum();
sum as f64 / numbers.len() as f64
}
"#;
        let mean_impl = r#"
fn compute_mean(values: &[i32]) -> f64 {
if values.is_empty() {
return 0.0;
}
let total: i32 = values.iter().sum();
total as f64 / values.len() as f64
}
"#;

        let files = vec![
            (PathBuf::from("file1.rs"), String::from(average_impl)),
            (PathBuf::from("file2.rs"), String::from(mean_impl)),
        ];

        let report = detector.comprehensive_analysis(&files);
        assert!(report.entropy_analysis.is_some());
        assert!(report.metrics.average_entropy >= 0.0);
        assert!(report.metrics.duplication_percentage >= 0.0);
    }

    /// A lightly reworded text must score as more similar to the original
    /// than an unrelated text does.
    #[test]
    fn test_winnowing_plagiarism_detection_scenario() {
        let winnower = Winnowing::new(10, 5);

        let original = "This is an original academic paper discussing the implementation of winnowing algorithms for plagiarism detection. The technique uses fingerprinting to identify similar content.";
        let modified = "This paper discusses winnowing algorithms for plagiarism detection. The method uses fingerprinting techniques to find similar content in documents.";
        let different = "Machine learning has transformed many industries. Neural networks provide powerful tools for pattern recognition and data analysis.";

        let original_fp = winnower.fingerprint(original);
        let modified_fp = winnower.fingerprint(modified);
        let unrelated_fp = winnower.fingerprint(different);

        let reworded_score = winnower.similarity(&original_fp, &modified_fp);
        let unrelated_score = winnower.similarity(&original_fp, &unrelated_fp);

        assert!(reworded_score > unrelated_score);
    }

    /// The comprehensive analysis accepts a mix of Python, JavaScript and Rust
    /// sources and reports a non-negative average entropy.
    #[test]
    fn test_multiple_language_support() {
        let config = SimilarityConfig {
            min_lines: 2,
            min_tokens: 5,
            ..SimilarityConfig::default()
        };
        let detector = SimilarityDetector::new(config);

        let sources = [
            (
                "hello.py",
                "def hello():\n print('Hello')\n return True\n",
            ),
            (
                "hello.js",
                "function hello() {\n console.log('Hello');\n return true;\n}\n",
            ),
            ("hello.rs", "fn hello() {\n println!(\"Hello\");\n true\n}\n"),
        ];
        let files: Vec<(PathBuf, String)> = sources
            .iter()
            .map(|&(name, body)| (PathBuf::from(name), body.to_string()))
            .collect();

        let report = detector.comprehensive_analysis(&files);
        assert!(report.metrics.average_entropy >= 0.0);
    }

    /// Analyses one generated file holding 100 small functions and checks that
    /// the reported average entropy is non-negative.
    #[test]
    fn test_large_file_handling() {
        let config = SimilarityConfig {
            min_lines: 6,
            min_tokens: 50,
            ..SimilarityConfig::default()
        };
        let detector = SimilarityDetector::new(config);

        // Each index contributes the five pieces of one generated function,
        // concatenated in order.
        let large_content: String = (0..100)
            .flat_map(|i| {
                [
                    format!("fn function_{i}() {{\n"),
                    format!(" let x_{i} = {i};\n"),
                    format!(" let y_{i} = {i} * 2;\n"),
                    format!(" println!(\"{{}} {{}}\", x_{i}, y_{i});\n"),
                    "}\n\n".to_string(),
                ]
            })
            .collect();

        let files = vec![(PathBuf::from("large.rs"), large_content)];
        let report = detector.comprehensive_analysis(&files);
        assert!(report.metrics.average_entropy >= 0.0);
    }
}
/// Tests probing extreme configuration values and undersized inputs.
mod boundary_tests {
    use super::*;

    /// Smoke test: structural similarity must run at both ends of the
    /// threshold range (0.0 and 1.0) without panicking.
    #[test]
    fn test_similarity_threshold_boundaries() {
        let config = SimilarityConfig {
            min_lines: 2,
            min_tokens: 3,
            similarity_threshold: 0.0,
            ..SimilarityConfig::default()
        };
        let detector = SimilarityDetector::new(config);

        let first = (
            PathBuf::from("a.rs"),
            "abc def ghi\nabc def ghi\n".to_string(),
        );
        let second = (
            PathBuf::from("b.rs"),
            "xyz uvw rst\nxyz uvw rst\n".to_string(),
        );
        let files = vec![first, second];

        // Lower bound first, then upper bound.
        for threshold in [0.0, 1.0] {
            let _ = detector.detect_structural_similarity(&files, threshold);
        }
    }

    /// Smoke test: a one-line file with `min_lines` and `min_tokens` set to 1
    /// must not cause a panic.
    #[test]
    fn test_min_lines_boundary() {
        let config = SimilarityConfig {
            min_lines: 1,
            min_tokens: 1,
            ..SimilarityConfig::default()
        };
        let detector = SimilarityDetector::new(config);

        let only_file = (
            PathBuf::from("single.rs"),
            "single line content".to_string(),
        );
        let files = vec![only_file];

        let _ = detector.detect_exact_duplicates(&files);
    }

    /// With both constructor arguments at 100 and a five-character input, no
    /// fingerprints are produced.
    #[test]
    fn test_k_gram_larger_than_text() {
        let winnower = Winnowing::new(100, 100);
        let fingerprint = winnower.fingerprint("short");
        assert!(fingerprint.is_empty());
    }

    /// With a first constructor argument of 1000 and a short input, the
    /// fingerprint count stays at or below 100.
    #[test]
    fn test_window_larger_than_k_grams() {
        let winnower = Winnowing::new(1000, 5);
        let fingerprint = winnower.fingerprint("hello world test string");
        assert!(fingerprint.len() <= 100);
    }
}