use std::path::Path;
use anyhow::{Result, Context};
use image::{GenericImageView, DynamicImage, imageops::FilterType, Pixel};
use std::collections::HashMap;
use std::f32;
use crate::cli::ImageHashAlgorithm;
pub fn levenshtein(a: &str, b: &str) -> usize {
let mut costs = vec![0; b.len() + 1];
for j in 0..=b.len() {
costs[j] = j;
}
for (i, ca) in a.chars().enumerate() {
let mut last = i;
costs[0] = i + 1;
for (j, cb) in b.chars().enumerate() {
let old = costs[j + 1];
costs[j + 1] = if ca == cb {
last
} else {
1 + last.min(costs[j]).min(costs[j + 1])
};
last = old;
}
}
costs[b.len()]
}
pub fn compare_text_files(path1: &Path, path2: &Path) -> Result<usize> {
let text1 = std::fs::read_to_string(path1)?;
let text2 = std::fs::read_to_string(path2)?;
Ok(levenshtein(&text1, &text2))
}
pub fn text_similarity(path1: &Path, path2: &Path) -> Result<f32> {
let distance = compare_text_files(path1, path2)? as f32;
let max_len = std::cmp::max(
std::fs::read_to_string(path1)?.len(),
std::fs::read_to_string(path2)?.len(),
) as f32;
if max_len == 0.0 {
return Ok(1.0); }
Ok(1.0 - (distance / max_len).min(1.0))
}
pub fn group_similar_text_files(
files: &[String],
similarity_threshold: f32,
) -> Result<Vec<Vec<String>>> {
if files.is_empty() {
return Ok(Vec::new());
}
let mut size_groups: HashMap<u64, Vec<String>> = HashMap::new();
for file in files {
if let Ok(metadata) = std::fs::metadata(file) {
size_groups.entry(metadata.len()).or_default().push(file.clone());
}
}
let mut groups: Vec<Vec<String>> = Vec::new();
let mut processed = std::collections::HashSet::new();
for file in files {
if processed.contains(file) {
continue;
}
let mut current_group = vec![file.clone()];
processed.insert(file.clone());
if let Ok(metadata) = std::fs::metadata(file) {
if let Some(similar_sized) = size_groups.get(&metadata.len()) {
for other_file in similar_sized {
if processed.contains(other_file) || file == other_file {
continue;
}
if let Ok(similarity) = text_similarity(
Path::new(file),
Path::new(other_file),
) {
if similarity >= similarity_threshold {
current_group.push(other_file.clone());
processed.insert(other_file.clone());
}
}
}
}
}
if current_group.len() > 1 {
groups.push(current_group);
}
}
Ok(groups)
}
#[derive(Debug, Clone)]
pub struct ImageSignature {
pub avg_hash: u64,
pub phash: u64,
pub dhash: u64,
pub color_hash: u64,
}
pub fn generate_image_signature(path: &Path) -> Result<ImageSignature> {
let img = image::open(path)
.with_context(|| format!("Failed to open image: {}", path.display()))?;
let gray_img = img.grayscale();
let ((avg_hash, phash), (dhash, color_hash)) = rayon::join(
|| {
rayon::join(
|| average_hash(&gray_img).unwrap_or_else(|_| 0),
|| perceptual_hash(&gray_img).unwrap_or_else(|_| 0),
)
},
|| {
rayon::join(
|| difference_hash(&gray_img).unwrap_or_else(|_| 0),
|| color_hash(&img).unwrap_or_else(|_| 0),
)
},
);
Ok(ImageSignature {
avg_hash,
phash,
dhash,
color_hash,
})
}
pub fn compare_image_signatures(sig1: &ImageSignature, sig2: &ImageSignature) -> f32 {
let weights = [0.3, 0.4, 0.2, 0.1];
let scores = [
hamming_similarity(sig1.avg_hash, sig2.avg_hash),
hamming_similarity(sig1.phash, sig2.phash),
hamming_similarity(sig1.dhash, sig2.dhash),
hamming_similarity(sig1.color_hash, sig2.color_hash),
];
scores.iter()
.zip(weights.iter())
.map(|(&score, &weight)| score * weight)
.sum()
}
pub fn average_hash(img: &DynamicImage) -> Result<u64> {
let img = img.resize_exact(8, 8, FilterType::Lanczos3);
let mut total = 0u32;
let mut pixels = [0u8; 64];
for (i, p) in img.pixels().enumerate() {
let luma = p.2[0];
pixels[i] = luma;
total += luma as u32;
}
let avg = total / 64;
let mut hash = 0u64;
for (i, &luma) in pixels.iter().enumerate() {
if luma as u32 >= avg {
hash |= 1 << i;
}
}
Ok(hash)
}
pub fn perceptual_hash(img: &DynamicImage) -> Result<u64> {
let img = img.resize_exact(32, 32, FilterType::Lanczos3).to_luma8();
let mut pixels = vec![0.0; 32 * 32];
for (i, (_, _, pixel)) in img.enumerate_pixels().enumerate() {
pixels[i] = pixel[0] as f64 / 255.0;
}
let dct = apply_dct_2d(&pixels, 32);
let mut dct_values = Vec::with_capacity(64);
for i in 0..8 {
for j in 0..8 {
if i == 0 && j == 0 { continue; } dct_values.push(dct[i * 32 + j]);
}
}
dct_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let median = if !dct_values.is_empty() {
dct_values[dct_values.len() / 2]
} else {
return Err(anyhow::anyhow!("Failed to calculate median for perceptual hash"));
};
let mut hash = 0u64;
for (i, &val) in dct_values.iter().enumerate() {
if val > median && i < 64 {
hash |= 1u64 << i;
}
}
Ok(hash)
}
fn difference_hash(img: &DynamicImage) -> Result<u64> {
let img = img.resize_exact(9, 8, FilterType::Lanczos3);
let mut hash = 0u64;
for y in 0..8 {
for x in 0..8 {
let left = img.get_pixel(x, y).to_luma()[0];
let right = img.get_pixel(x + 1, y).to_luma()[0];
if left > right {
hash |= 1 << (y * 8 + x);
}
}
}
Ok(hash)
}
fn color_hash(img: &DynamicImage) -> Result<u64> {
let img = img.resize_exact(8, 8, FilterType::Lanczos3);
let mut hash = 0u64;
for (_i, (_x, _y, pixel)) in img.pixels().enumerate() {
let r = pixel[0] as u32;
let g = pixel[1] as u32;
let b = pixel[2] as u32;
let rq = (r / 64) as u8;
let gq = (g / 64) as u8;
let bq = (b / 64) as u8;
let color_byte = (rq << 4) | (gq << 2) | bq;
hash = hash.wrapping_mul(31).wrapping_add(color_byte as u64);
}
Ok(hash)
}
fn apply_dct_2d(matrix: &[f64], size: usize) -> Vec<f64> {
let mut output = vec![0.0; size * size];
let c1 = std::f64::consts::PI / (size as f64);
for u in 0..size {
for v in 0..size {
let cu = if u == 0 { 1.0 / 2.0f64.sqrt() } else { 1.0 };
let cv = if v == 0 { 1.0 / 2.0f64.sqrt() } else { 1.0 };
let mut sum = 0.0;
for x in 0..size {
for y in 0..size {
let cos1 = (c1 * (x as f64 + 0.5) * u as f64).cos();
let cos2 = (c1 * (y as f64 + 0.5) * v as f64).cos();
sum += matrix[x * size + y] * cos1 * cos2;
}
}
output[u * size + v] = 0.25 * cu * cv * sum;
}
}
output
}
#[allow(dead_code)]
pub fn compare_images(path1: &Path, path2: &Path) -> Result<f32> {
let sig1 = generate_image_signature(path1)?;
let sig2 = generate_image_signature(path2)?;
Ok(compare_image_signatures(&sig1, &sig2))
}
pub fn hamming_distance(a: u64, b: u64) -> u32 {
(a ^ b).count_ones()
}
pub fn hamming_similarity(a: u64, b: u64) -> f32 {
let distance = hamming_distance(a, b) as f32;
1.0 - (distance / 64.0).min(1.0)
}
pub fn compare_images_with_algorithm(
path1: &str,
path2: &str,
algorithm: ImageHashAlgorithm,
) -> Option<f32> {
let img1 = match image::open(path1) {
Ok(img) => img,
Err(_) => return None,
};
let img2 = match image::open(path2) {
Ok(img) => img,
Err(_) => return None,
};
match algorithm {
ImageHashAlgorithm::Avg => {
let hash1 = average_hash(&img1).ok()?;
let hash2 = average_hash(&img2).ok()?;
Some(hamming_similarity(hash1, hash2))
}
ImageHashAlgorithm::Phash => {
let hash1 = perceptual_hash(&img1).ok()?;
let hash2 = perceptual_hash(&img2).ok()?;
Some(hamming_similarity(hash1, hash2))
}
ImageHashAlgorithm::Dhash => {
let hash1 = difference_hash(&img1).ok()?;
let hash2 = difference_hash(&img2).ok()?;
Some(hamming_similarity(hash1, hash2))
}
ImageHashAlgorithm::Color => {
let hash1 = color_hash(&img1).ok()?;
let hash2 = color_hash(&img2).ok()?;
Some(hamming_similarity(hash1, hash2))
}
ImageHashAlgorithm::Combined => {
let sig1 = generate_image_signature(Path::new(path1)).ok()?;
let sig2 = generate_image_signature(Path::new(path2)).ok()?;
Some(compare_image_signatures(&sig1, &sig2))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
fn create_test_file(content: &str) -> NamedTempFile {
let mut file = NamedTempFile::new().unwrap();
file.write_all(content.as_bytes()).unwrap();
file
}
#[test]
fn test_text_similarity_identical() {
let file1 = create_test_file("This is a test file with some content.");
let file2 = create_test_file("This is a test file with some content.");
let similarity = text_similarity(&file1.path(), &file2.path()).unwrap();
assert!((similarity - 1.0).abs() < f32::EPSILON);
}
#[test]
fn test_text_similarity_different() {
let file1 = create_test_file("This is a test file with some content.");
let file2 = create_test_file("This is a completely different file with different content.");
let similarity = text_similarity(&file1.path(), &file2.path()).unwrap();
assert!(similarity < 0.5);
}
#[test]
fn test_text_similarity_empty() {
let file1 = create_test_file("");
let file2 = create_test_file("");
let similarity = text_similarity(&file1.path(), &file2.path()).unwrap();
assert!((similarity - 1.0).abs() < f32::EPSILON);
}
#[test]
fn test_group_similar_text_files() {
let file1 = create_test_file("This is a test file with some content.");
let file2 = create_test_file("This is a test file with some content."); let file3 = create_test_file("This is a test file with slightly different content."); let file4 = create_test_file("This is completely different content.");
let files = vec![
file1.path().to_str().unwrap().to_string(),
file2.path().to_str().unwrap().to_string(),
file3.path().to_str().unwrap().to_string(),
file4.path().to_str().unwrap().to_string(),
];
let groups = group_similar_text_files(&files, 0.9).unwrap();
assert_eq!(groups.len(), 1);
assert_eq!(groups[0].len(), 2);
let groups = group_similar_text_files(&files, 0.7).unwrap();
assert_eq!(groups.len(), 1);
assert!(groups[0].len() >= 3);
let groups = group_similar_text_files(&files, 0.3).unwrap();
assert!(!groups.is_empty());
}
#[test]
fn test_levenshtein_distance() {
assert_eq!(levenshtein("kitten", "sitting"), 3);
assert_eq!(levenshtein("book", "back"), 2);
assert_eq!(levenshtein("", "test"), 4);
assert_eq!(levenshtein("test", ""), 4);
assert_eq!(levenshtein("", ""), 0);
}
#[test]
fn test_levenshtein() {
assert_eq!(levenshtein("kitten", "sitting"), 3);
assert_eq!(levenshtein("flaw", "lawn"), 2);
assert_eq!(levenshtein("", ""), 0);
assert_eq!(levenshtein("a", "a"), 0);
assert_eq!(levenshtein("abc", ""), 3);
assert_eq!(levenshtein("", "abc"), 3);
}
#[test]
fn test_hamming_distance() {
assert_eq!(hamming_distance(0b1010, 0b1111), 2);
assert_eq!(hamming_distance(0, 0), 0);
assert_eq!(hamming_distance(u64::MAX, 0), 64);
}
#[test]
fn test_hamming_similarity() {
assert!((hamming_similarity(0b1010, 0b1111) - 0.96875).abs() < f32::EPSILON);
assert!((hamming_similarity(0, 0) - 1.0).abs() < f32::EPSILON);
assert!((hamming_similarity(u64::MAX, 0) - 0.0).abs() < f32::EPSILON);
}
#[test]
fn test_compare_images_same() -> Result<()> {
let mut img = image::GrayImage::new(100, 100);
for (x, y, pixel) in img.enumerate_pixels_mut() {
*pixel = image::Luma([(x + y) as u8]);
}
let mut file1 = NamedTempFile::new()?;
let path1 = file1.path().to_owned();
img.save(&path1)?;
let similarity = compare_images(&path1, &path1)?;
assert!((similarity - 1.0).abs() < f32::EPSILON, "Image should be identical to itself");
Ok(())
}
#[test]
fn test_compare_images_different() -> Result<()> {
let mut img1 = image::GrayImage::new(100, 100);
let mut img2 = image::GrayImage::new(100, 100);
for (x, y, pixel) in img1.enumerate_pixels_mut() {
*pixel = image::Luma([(x + y) as u8]);
}
for (x, y, pixel) in img2.enumerate_pixels_mut() {
*pixel = image::Luma([(x * 2 + y) as u8]);
}
let mut file1 = NamedTempFile::new()?;
let mut file2 = NamedTempFile::new()?;
let path1 = file1.path().to_owned();
let path2 = file2.path().to_owned();
img1.save(&path1)?;
img2.save(&path2)?;
let similarity = compare_images(&path1, &path2)?;
assert!(similarity < 0.5, "Different images should have low similarity");
Ok(())
}
#[test]
fn test_average_hash() -> Result<()> {
let img = image::GrayImage::from_pixel(8, 8, image::Luma([128u8]));
let hash = average_hash(&DynamicImage::ImageLuma8(img))?;
assert!(hash == 0 || hash == u64::MAX);
Ok(())
}
#[test]
fn test_perceptual_hash() -> Result<()> {
let img = image::GrayImage::from_pixel(32, 32, image::Luma([128u8]));
let hash = perceptual_hash(&DynamicImage::ImageLuma8(img))?;
assert_ne!(hash, 0);
assert_ne!(hash, u64::MAX);
let mut img2 = image::GrayImage::new(32, 32);
for (x, y, pixel) in img2.enumerate_pixels_mut() {
*pixel = image::Luma([(x + y) as u8]);
}
let hash2 = perceptual_hash(&DynamicImage::ImageLuma8(img2))?;
assert_ne!(hash, hash2);
Ok(())
}
#[test]
fn test_difference_hash() -> Result<()> {
let img = image::GrayImage::from_pixel(9, 8, image::Luma([128u8]));
let hash = difference_hash(&DynamicImage::ImageLuma8(img))?;
assert!(hash == 0 || hash == u64::MAX);
let mut img2 = image::GrayImage::new(9, 8);
for (x, y, pixel) in img2.enumerate_pixels_mut() {
*pixel = image::Luma([(x + y) as u8]);
}
let hash2 = difference_hash(&DynamicImage::ImageLuma8(img2))?;
let uniform_hash = difference_hash(&DynamicImage::ImageLuma8(
image::GrayImage::from_pixel(9, 8, image::Luma([128u8]))
))?;
assert_ne!(hash2, uniform_hash);
Ok(())
}
#[test]
fn test_color_hash() -> Result<()> {
let img = image::RgbImage::from_pixel(8, 8, image::Rgb([255, 0, 0]));
let hash = color_hash(&DynamicImage::ImageRgb8(img))?;
let img2 = image::RgbImage::from_pixel(8, 8, image::Rgb([0, 255, 0]));
let hash2 = color_hash(&DynamicImage::ImageRgb8(img2))?;
assert_ne!(hash, hash2);
let img3 = image::RgbImage::from_pixel(8, 8, image::Rgb([255, 0, 0]));
let hash3 = color_hash(&DynamicImage::ImageRgb8(img3))?;
assert_eq!(hash, hash3);
Ok(())
}
#[test]
fn test_image_signature() -> Result<()> {
let mut img = image::RgbImage::new(32, 32);
for (x, y, pixel) in img.enumerate_pixels_mut() {
*pixel = image::Rgb([(x + y) as u8, x as u8, y as u8]);
}
let mut file = NamedTempFile::new()?;
let path = file.path().to_owned();
DynamicImage::ImageRgb8(img).save(&path)?;
let sig = generate_image_signature(&path)?;
let sig2 = generate_image_signature(&path)?;
assert_eq!(sig.avg_hash, sig2.avg_hash);
assert_eq!(sig.phash, sig2.phash);
assert_eq!(sig.dhash, sig2.dhash);
assert_eq!(sig.color_hash, sig2.color_hash);
let similarity = compare_image_signatures(&sig, &sig2);
assert!((similarity - 1.0).abs() < f32::EPSILON);
let mut img2 = image::RgbImage::new(32, 32);
for (x, y, pixel) in img2.enumerate_pixels_mut() {
*pixel = image::Rgb([(x * 2 + y) as u8, (y * 2) as u8, x as u8]);
}
let mut file2 = NamedTempFile::new()?;
let path2 = file2.path().to_owned();
DynamicImage::ImageRgb8(img2).save(&path2)?;
let sig3 = generate_image_signature(&path2)?;
let similarity = compare_image_signatures(&sig, &sig3);
assert!(similarity < 0.9, "Different images should have similarity < 0.9");
Ok(())
}
#[test]
fn test_apply_dct_2d() {
let input = vec![1.0, 2.0, 3.0, 4.0,
5.0, 6.0, 7.0, 8.0,
9.0, 10.0, 11.0, 12.0,
13.0, 14.0, 15.0, 16.0];
let output = apply_dct_2d(&input, 4);
assert_eq!(output.len(), 16);
let avg = input.iter().sum::<f64>() / 16.0;
assert!((output[0] - avg * 4.0).abs() < 1e-10);
let const_input = vec![1.0; 16];
let const_output = apply_dct_2d(&const_input, 4);
assert!((const_output[0] - 16.0).abs() < 1e-10);
for &coeff in &const_output[1..] {
assert!(coeff.abs() < 1e-10, "AC coefficient should be zero for constant input");
}
}
}