photodedupe 1.0.6

Utility for identifying duplicate photos
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
extern crate image;

use std::cmp::Ordering;
use image::{GenericImageView, DynamicImage};
use image::imageops::FilterType;
use image::ImageReader;
use std::fs;

use crate::image_error::MyImageError;

#[derive(Clone)]
pub struct ImagePath {
	/// The path to a valid image file
	pub fpath: String,
	/// True if the image exists in the new images comparison dir (false if feature not in use or else in existing collection)	
	pub is_compare_dir : bool,
	/// True if when using --compare a duplicate should always be marked even if a better quality than the existing image
	pub always_mark_dupe_compare : bool,
}

/// Statistics about an image that are used to perform the deduplication
pub struct ImageHashAV {
	/// A common key to group potential duplicates - same integer means possible (but not yet confirmed) dupe
	pub dupe_group : u64 ,
	/// A perceptual hash code of a greyscale low resolution version of the image
	pub grey_hash : u64,
	/// The pixels of a colour low resolution version of the image
	pub low_res : [u8;192],
	/// Width of the original image in pixels
	pub width: u32,
	/// Height of the original image in pixels
	pub height: u32,
	/// File size in bytes
	pub file_size : u64,
	/// Total number of pixels in the original image
	pub num_pixels: u64,
	/// Standard deviation of colour values from the mean (used to avoid testing images with low variation)
	pub std_dev : f32,
	/// The path to the image
	pub image_path: ImagePath,
}

/// Holds the configuration options that are set on the command line
pub struct ConfigOptions {
	/// Controls how likely the system is to determine an image is a duplicate
	pub colour_difference_threshold : u64,
	/// Controls a threshold below which it will declare images unique that the system can't work with (e.g. very dark images)
	pub std_dev_threshold : f32,			
	/// The number of images at which a faster algorithm is used
	pub alg_flip_threshold : u64,
	/// Force use of the more computationally expensive but more accurate algorithm
	pub alg_colour_diff_only : bool,
	/// Only consider known image file extensions e.g. .jpg .png etc
	pub only_known_file_extensions : bool,
	/// Option to only list the duplicates found and not the best versions of each image
	pub only_list_duplicates : bool,
	/// Option to only list the uniques images found and not the duplicates
	pub only_list_uniques : bool,
	/// Whether to output all images found as opposed to just those with duplicates
	pub list_all : bool,
	/// How many threads to use to process images
	pub num_threads : u32,
	/// The path to the comparison directory			
	pub compare_dir : String,
	/// If the --compare option is used
	pub am_comparing : bool,
	/// If the --always-mark-duplicates option is used
	pub always_mark_duplicates : bool,
	/// The minimum accepted image width
	pub min_width : u32,
	/// The minimum accepted image height
	pub min_height : u32,
}


/// Describes the sort order for ImageHashAV objects
/// Order the images with the following keys
/// 1st) The dupe_group (ascending)
/// 2nd) If the comparison image is in the --compare directory, sort further down the list if the --always-mark-duplicates option is set
/// 3rd) The total number of pixels (descending) - prefers higher resolution images as better quality
/// 4th) The file size (descending) - prefers larger images as better quality where they are the same resolution
/// 5th) Where --compare is used, prefers the image in the original collection and the new image will be the duplicate 
impl Ord for ImageHashAV {
	
    fn cmp(&self, other: &Self) -> Ordering {

	//Sort images into groups of duplicates first
	if self.dupe_group < other.dupe_group{
		return Ordering::Less;
	}
	if self.dupe_group > other.dupe_group{
		return Ordering::Greater;
	}

	//If the comparison image is in the --compare directory, sort further down the list if the --always-mark-duplicates option is set
	//This makes it a duplicate prior to checking if it's better resolution
	if self.image_path.always_mark_dupe_compare && self.image_path.is_compare_dir && (!other.image_path.always_mark_dupe_compare) {
		return Ordering::Greater;
	}

	if (!self.image_path.always_mark_dupe_compare) && other.image_path.always_mark_dupe_compare && other.image_path.is_compare_dir {
		return Ordering::Less;
	}
	
	//Push files with greater number of pixels further up the list
	if self.num_pixels > other.num_pixels{
		return Ordering::Less;
	}
	
	if self.num_pixels < other.num_pixels{
		return Ordering::Greater;
	}
	
	//Push larger file sizes further up the list
	if self.file_size > other.file_size {
		return Ordering::Less;
	}
	
	if self.file_size < other.file_size {
		return Ordering::Greater;
	}

	//Where --compare is used, prefers the image in the original collection and the new image will be the duplicate
	if self.image_path.is_compare_dir && (!other.image_path.is_compare_dir) {
		return Ordering::Greater;
	}

	if (!self.image_path.is_compare_dir) && other.image_path.is_compare_dir {
		return Ordering::Less;
	}
	
	return Ordering::Equal
    }
    
}

impl Eq for ImageHashAV {}

impl PartialOrd for ImageHashAV {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for ImageHashAV {
    fn eq(&self, other: &Self) -> bool {
        if (self.dupe_group == other.dupe_group) && 
           ( self.num_pixels == other.num_pixels ) && 
           ( self.file_size == other.file_size ) {
			return true;
		}
		
		return false;
    }
}

///Open an image from the specific path. Tries to guess the format if it's not known.
fn load_image_from_file( image_path: &str  ) -> std::result::Result<DynamicImage, MyImageError> {
	
	
	let img = match ImageReader::open(image_path) {
		Ok(image) => image,
		Err(_) => {
			return Err(MyImageError::FileError(format!("Error: Failed to read image file: {}", image_path).to_string()));
		},
	};
	
	let format_guessed = match img.with_guessed_format() {
		Ok( format_guessed ) => format_guessed,
		Err(_) => {
				return Err(MyImageError::DecodeFail(format!("Error: Failed to identify image file format {}", image_path).to_string()));
		}
	};
	
	let decoded_img = match format_guessed.decode() {
		Ok( decoded_img ) => decoded_img,
		Err(_) => {
				return Err( MyImageError::DecodeFail(format!("Error: Failed to correctly decode image: {}", image_path).to_string()) );
		}
	};
	
	return Ok(decoded_img);
}



impl ImageHashAV {
	
	/// Default colour difference threshold under which two images are declared dupes
	pub const DEFAULT_COLOUR_DIFF_THRESHOLD: u64 = 256;	
	/// Default colour variation threshold under which de-duplication is not attempted
	pub const DEFAULT_STD_DEV_THRESHOLD : f32 = 3.0;
	/// Number of files at which we flip to the less accurate but faster algorithm
	pub const DEFAULT_ALG_FLIP_THRESHOLD : u64 = 50000;
		
	pub fn new(fpath : &ImagePath, min_width: u32, min_height : u32) -> Result<ImageHashAV,MyImageError> {
		let mut object = ImageHashAV {	dupe_group: 0, grey_hash: 0, low_res: [0;192], 
						width: 0, height: 0, num_pixels: 0, std_dev: 0f32, 
						file_size: 0, image_path : ImagePath { fpath: "".to_string(), is_compare_dir: false, always_mark_dupe_compare: false } };
		match object.calc_image_hash( &fpath,  min_width, min_height ) {
			Some(e) => return Err(e),
			None => return Ok(object),
		}
	}
	
	/// Check if two image aspect ratios are within 2% of each other
	pub fn has_similar_aspect_ratio( &self, comp: &ImageHashAV ) -> bool {
		let aspect_ratio_a : f32 = self.width as f32 / self.height as f32;
		let aspect_ratio_b : f32 = comp.width as f32 / comp.height as f32;
		
		let aspect_ratio_a_high = aspect_ratio_a * 1.02;
		let aspect_ratio_a_low = aspect_ratio_a - (aspect_ratio_a * 0.02);
		
		if aspect_ratio_b <= aspect_ratio_a_high && aspect_ratio_b >= aspect_ratio_a_low {
			return true;
		}
		
		return false;
	}
	
	/// Difference between the low_res version of this and another imagehash
	pub fn diff_colour( &self, comp: &ImageHashAV ) -> u64{
		
		let mut diff: u64 = 0;
		
		for i in 0..64 {
			let rdiff : u32 = (comp.low_res[i*3] as i32 - self.low_res[i*3] as i32).abs() as u32;
			let gdiff : u32 = (comp.low_res[(i*3)+1] as i32 - self.low_res[(i*3)+1] as i32).abs() as u32;
			let bdiff : u32 = (comp.low_res[(i*3)+2] as i32 - self.low_res[(i*3)+2] as i32).abs() as u32;
			
			diff += ( rdiff + gdiff + bdiff ) as u64;
		}
		
		return diff;
		
	}
	
	/// Test if two images are duplicates of each other by looking at the comparitive variance in the colours
	pub fn is_dupe ( &self, other : &ImageHashAV, config: &ConfigOptions ) -> bool {

		//Excludes dark images with little variation which are difficult to dedupe correctly
		if self.std_dev > config.std_dev_threshold && other.std_dev > config.std_dev_threshold {	
			//Checks the images have a similar aspect ratio	
			if self.has_similar_aspect_ratio( &other ) {
				//Checks the colour differences are similar
				if self.diff_colour( &other ) <= config.colour_difference_threshold {
					return true;
				}
			}
		}
		
		return false;
	}
	
	/// For each colour channel calculate the stdv of the pixels values and then take the average of the colour channels
	pub fn calc_std_dev_colour_hash ( &mut self ) {
		
		let mut r_pixel_av : f32 = 0.0;
		let mut g_pixel_av : f32 = 0.0;
		let mut b_pixel_av : f32 = 0.0;
		let mut r_square_total : f32 = 0.0;
		let mut g_square_total : f32 = 0.0;
		let mut b_square_total : f32 = 0.0;
		
		for i in 0..64 {
			r_pixel_av += self.low_res[i*3] as f32;
			g_pixel_av += self.low_res[(i*3)+1] as f32;
			b_pixel_av += self.low_res[(i*3)+2] as f32;
		}
		r_pixel_av /= 64.0;
		g_pixel_av /= 64.0;
		b_pixel_av /= 64.0;
		
		for i in 0..64 {
			r_square_total += ( (self.low_res[i*3] as f32) - r_pixel_av ).powf(2f32);
			g_square_total += ( (self.low_res[(i*3)+1] as f32) - g_pixel_av ).powf(2f32);
			b_square_total += ( (self.low_res[(i*3)+2] as f32) - b_pixel_av ).powf(2f32);
		}
		r_square_total /= 64.0;
		g_square_total /= 64.0;
		b_square_total /= 64.0;
		
		//Return average std_dev in the colours
		self.std_dev = (r_square_total.sqrt() + g_square_total.sqrt() + b_square_total.sqrt())/3.0;
		
	}
	
	/// Populates image statistics including the perceptual hash
	pub fn calc_image_hash(&mut self, im_path: &ImagePath, min_width: u32, min_height : u32 ) -> Option<MyImageError> {
		   
		match load_image_from_file( &im_path.fpath ) {
			Ok(img) => {
			
				//Ignore very small images that the technique can't work with and also images below the user configured size
				let (width, height) = img.dimensions();
				if width < 16 || height < 16 {
					return Some( MyImageError::ImageTooSmall(format!("Warning: Image too small to deduplicate: {}", im_path.fpath).to_string()) );
				}
				
				if min_width > 0 && min_height > 0 {
					if width < min_width || height < min_height {
						return Some( MyImageError::ImageTooSmall(format!("Warning: Ignored image because dimensions ({},{}) are below minimum: {}",width,height, im_path.fpath).to_string()) );
					}
				}
		
				self.width = width;
				self.height = height;
				self.num_pixels = (width as u64)*(height as u64);
				self.image_path = im_path.clone();		

				//Get the file size as a tie breaker if image dimensions are the same
				match fs::metadata(im_path.fpath.clone()) {
					Ok(md)=> {
						self.file_size = md.len();
					}
					Err(_)=> {
						return Some(MyImageError::FileError(format!("Error: Failed to get size of: {}", im_path.fpath).to_string()));
					}
				}
		
								
				//Seems to work best with Gaussian, although it's the slowest
				let scaled = img.resize_exact(8,8,FilterType::Gaussian);
		
				let (width, height) = scaled.dimensions();
				if width != 8 || height != 8 {
					return Some( MyImageError::DecodeFail(format!("Error: Failed to resize image correctly: {}", im_path.fpath).to_string()) );
				}

				let gs = scaled.grayscale( );
		
				let mut num_pixels = 0;
				let mut total: u64 = 0;
				for pixel in gs.pixels() {
					let p: u64 = ((pixel.2).0)[0].into();
					total += p;
					num_pixels+=1;
				}
				let average: f32 = (total as f32)/ (num_pixels as f32);
		
				let mut hash_val: u64 = 0;
				let mut this_bit: u64 = 0;
		
				for pixel in gs.pixels() {
					let p: f32 = ((pixel.2).0)[0].into();
					if p >= average {
						hash_val = (((1 as u64) << this_bit ) as u64) | hash_val;
					}
					this_bit+=1;
				}				
		
				//Add the pixels of the low res original image into the struct
				let mut pnum : usize = 0;
				for pixel in scaled.pixels() {
					self.low_res[pnum*3] = ((pixel.2)[0]).into();
					self.low_res[(pnum*3)+1] = ((pixel.2)[1]).into();
					self.low_res[(pnum*3)+2] = ((pixel.2)[2]).into();
					pnum+=1;
				}
		
				self.dupe_group = hash_val;
				self.grey_hash = hash_val;
				self.calc_std_dev_colour_hash();

				return None;
			},
			Err(e) => {
				return Some(e);
			}	
		}
	}

}

#[cfg(test)]
mod tests {
	extern crate glob;
	use super::*;
	use glob::glob;

	/// Helper function to report the number of bits the same between two 64 bit values
    	fn calc_hamming_distance( a: u64, b: u64) -> u8 {
		let mut bits_similar : u8 = 0;
		for i in 0..64 {
			if (a & (1u64 << i)) == (b & (1u64 << i)) {
				bits_similar+=1;
			}
		}
		return bits_similar;
	}

	/// Test an image is read and metadata extracted correctly
	#[test]
	fn test_image_read() {
		let result = ImageHashAV::new( &ImagePath { fpath: "unit_test_images/bridge1_best.jpg".to_string(), is_compare_dir:false, always_mark_dupe_compare: false },0,0 ).unwrap();
		assert_eq!(768,result.width,"Width OK");
		assert_eq!(576,result.height,"Height OK");
		assert_eq!(576*768,result.num_pixels,"NUm pixels OK");
	}
    
	/// Test that images that should be duplicates of each other are correctly identified
	#[test]
	fn test_image_duplicates() {
		let mut image_paths = Vec::new();
	
		//Get a list of all the images in the unit_test_images directory.
		for entry in glob("unit_test_images/*").expect("Failed to read glob pattern") {
			let path = entry.unwrap().display().to_string();
			//Only list files that contain best and duplicate
			if path.contains("_best.") || path.contains("_duplicate_") {
				image_paths.push( path );
			}
		
		}
		//Sort the list such that each image with the name "best" should now be followed by exactly 2 duplicates.
		image_paths.sort();
		assert!(image_paths.len() >=3 , "3 or more image paths");
		assert!(image_paths.len() % 3 == 0, "Image paths divides by 3 (best + 2 duplicates per image)");
	
		//Check the best image matches the two duplicates
		for i in 0..(image_paths.len()/3) {
			let result = ImageHashAV::new( &ImagePath { fpath: image_paths[i*3].clone(), is_compare_dir:false, always_mark_dupe_compare: false },0,0 ).unwrap();
			let dupe1 = ImageHashAV::new( &ImagePath { fpath:  image_paths[(i*3)+1].clone(), is_compare_dir:false, always_mark_dupe_compare: false },0,0 ).unwrap();
			let dupe2 = ImageHashAV::new( &ImagePath { fpath:  image_paths[(i*3)+2].clone(), is_compare_dir:false, always_mark_dupe_compare: false },0,0 ).unwrap();
		
			//Check the duplicates match the best versions within a hamming distance of 1 bit (max 64 bits can be similar)
			assert!( calc_hamming_distance(result.dupe_group, dupe1.dupe_group) >= 63, "First duplicate grey hash matches" );
			assert!( calc_hamming_distance(result.dupe_group, dupe2.dupe_group) >= 63, "Second duplicate grey hash matches" );
		
			assert!( result.diff_colour( &dupe1 ) <= ImageHashAV::DEFAULT_COLOUR_DIFF_THRESHOLD, "First duplicate colours are similar" );
			assert!( result.diff_colour( &dupe2 ) <= ImageHashAV::DEFAULT_COLOUR_DIFF_THRESHOLD, "Second duplicate colours are similar" );
		}			
	}
    
	/// Test that images which should not be duplicates of each other do not match
	#[test]
	fn test_image_uniques() {
		let mut image_paths = Vec::new();
		let mut image_hashes = Vec::new();
	
		//Get a list of images that should be unique
		for entry in glob("unit_test_images/*").expect("Failed to read glob pattern") {
			let path = entry.unwrap().display().to_string();
			if path.contains("_best.") {
				image_paths.push( path );
			}
		}
	
		for path in &image_paths {
			let result = ImageHashAV::new( &ImagePath { fpath:  path.to_string(), is_compare_dir:false, always_mark_dupe_compare: false },0,0 ).unwrap();
			image_hashes.push( result );
		}
		
		//Test every image that should be unique against every other and match sure none match
		for i in 0..image_hashes.len() {			
			for j in (i+1)..image_hashes.len() {
				//Checks that either the images are at least 1 bit different on the grey hash
				//or else the colours do not match
				assert!( (calc_hamming_distance(image_hashes[j].dupe_group, image_hashes[i].dupe_group) <= 63) || 
						(image_hashes[j].diff_colour( &image_hashes[i] ) > ImageHashAV::DEFAULT_COLOUR_DIFF_THRESHOLD),
						"Images that should be unique don't match");
			}
		}
	}
}