Skip to main content

immich_lib/
scoring.rs

1//! Metadata scoring and duplicate analysis.
2//!
3//! This module provides scoring algorithms for ranking assets by metadata completeness
4//! and detecting conflicts between duplicate assets.
5
6use serde::{Deserialize, Serialize};
7
8use crate::models::{AssetResponse, DuplicateGroup};
9
/// Points awarded per metadata category when that category is present.
///
/// The harder a category is to recover once lost, the more points it carries.
mod weights {
    /// GPS coordinates: the most valuable category, location data is irreplaceable.
    pub const GPS: u32 = 30;

    /// Timezone: needed to interpret timestamps correctly.
    pub const TIMEZONE: u32 = 20;

    /// Camera make/model: useful provenance information.
    pub const CAMERA_INFO: u32 = 15;

    /// Original capture timestamp.
    pub const CAPTURE_TIME: u32 = 15;

    /// Lens details: nice to have.
    pub const LENS_INFO: u32 = 10;

    /// Reverse-geocoded location, which can be derived again from GPS.
    pub const LOCATION: u32 = 10;
}
20
/// Largest per-axis difference, in degrees, at which two GPS readings are
/// still treated as the same place (roughly 11 meters at the equator).
const GPS_THRESHOLD: f64 = 1e-4;
24
/// Metadata completeness score for an asset.
///
/// Every category field holds either 0 (metadata absent) or that category's
/// full weight (metadata present); there is no partial credit. Higher totals
/// indicate more complete metadata.
///
/// The derived `Default` is the all-zero score, which is also what
/// `from_asset` returns for an asset that carries no EXIF info. The derived
/// equality compares every field, not just `total`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct MetadataScore {
    /// GPS coordinate score (0 or 30)
    pub gps: u32,

    /// Timezone score (0 or 20)
    pub timezone: u32,

    /// Camera make/model score (0 or 15)
    pub camera_info: u32,

    /// Original capture time score (0 or 15)
    pub capture_time: u32,

    /// Lens info score (0 or 10)
    pub lens_info: u32,

    /// Location (city/country) score (0 or 10)
    pub location: u32,

    /// Total weighted score (sum of all categories).
    ///
    /// `from_asset` computes this from the fields above; code that builds a
    /// score by hand is responsible for keeping it in sync.
    pub total: u32,
}
52
53impl PartialOrd for MetadataScore {
54    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
55        Some(self.cmp(other))
56    }
57}
58
59impl Ord for MetadataScore {
60    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
61        self.total.cmp(&other.total)
62    }
63}
64
65impl MetadataScore {
66    /// Score an asset based on its metadata completeness.
67    ///
68    /// Uses the `has_*()` helper methods on `ExifInfo` to determine
69    /// which metadata categories are present.
70    pub fn from_asset(asset: &AssetResponse) -> Self {
71        let Some(exif) = &asset.exif_info else {
72            return Self::default();
73        };
74
75        let gps = if exif.has_gps() { weights::GPS } else { 0 };
76        let timezone = if exif.has_timezone() {
77            weights::TIMEZONE
78        } else {
79            0
80        };
81        let camera_info = if exif.has_camera_info() {
82            weights::CAMERA_INFO
83        } else {
84            0
85        };
86        let capture_time = if exif.has_capture_time() {
87            weights::CAPTURE_TIME
88        } else {
89            0
90        };
91        let lens_info = if exif.has_lens_info() {
92            weights::LENS_INFO
93        } else {
94            0
95        };
96        let location = if exif.has_location() {
97            weights::LOCATION
98        } else {
99            0
100        };
101
102        let total = gps + timezone + camera_info + capture_time + lens_info + location;
103
104        Self {
105            gps,
106            timezone,
107            camera_info,
108            capture_time,
109            lens_info,
110            location,
111            total,
112        }
113    }
114}
115
/// Detected conflict between duplicate assets.
///
/// A conflict occurs when multiple assets have different values
/// for the same metadata field.
///
/// Serialized as an internally tagged enum: the variant is written to a
/// `type` field using its snake_case name (`gps`, `timezone`, `camera_info`,
/// `capture_time`), next to the variant's `values`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MetadataConflict {
    /// Different GPS coordinates across duplicates
    Gps {
        /// List of unique coordinate pairs (latitude, longitude)
        values: Vec<(f64, f64)>,
    },

    /// Different timezones across duplicates
    Timezone {
        /// List of unique timezone values
        values: Vec<String>,
    },

    /// Different camera make/model combinations across duplicates
    CameraInfo {
        /// List of unique camera identifiers
        values: Vec<String>,
    },

    /// Different original capture times across duplicates
    CaptureTime {
        /// List of unique capture timestamps
        values: Vec<String>,
    },
}
147
148/// Detect metadata conflicts across a set of assets.
149///
150/// A conflict is detected when multiple assets have different values
151/// for the same metadata field. This helps identify cases where
152/// automatic selection may lose important information.
153///
154/// # Arguments
155///
156/// * `assets` - Slice of assets to check for conflicts
157///
158/// # Returns
159///
160/// A vector of detected conflicts (empty if no conflicts found)
161pub fn detect_conflicts(assets: &[AssetResponse]) -> Vec<MetadataConflict> {
162    let mut conflicts = Vec::new();
163
164    // Check GPS conflicts
165    let gps_values: Vec<(f64, f64)> = assets
166        .iter()
167        .filter_map(|a| a.exif_info.as_ref())
168        .filter_map(|e| match (e.latitude, e.longitude) {
169            (Some(lat), Some(lon)) => Some((lat, lon)),
170            _ => None,
171        })
172        .collect();
173
174    if has_gps_conflict(&gps_values) {
175        let unique_gps = dedupe_gps(&gps_values);
176        conflicts.push(MetadataConflict::Gps { values: unique_gps });
177    }
178
179    // Check timezone conflicts
180    let timezone_values: Vec<String> = assets
181        .iter()
182        .filter_map(|a| a.exif_info.as_ref())
183        .filter_map(|e| e.time_zone.clone())
184        .collect();
185
186    if let Some(unique) = find_unique_strings(&timezone_values) {
187        conflicts.push(MetadataConflict::Timezone { values: unique });
188    }
189
190    // Check camera info conflicts
191    let camera_values: Vec<String> = assets
192        .iter()
193        .filter_map(|a| a.exif_info.as_ref())
194        .filter_map(|e| {
195            let make = e.make.as_deref().unwrap_or("");
196            let model = e.model.as_deref().unwrap_or("");
197            if make.is_empty() && model.is_empty() {
198                None
199            } else {
200                Some(format!("{} {}", make, model).trim().to_string())
201            }
202        })
203        .collect();
204
205    if let Some(unique) = find_unique_strings(&camera_values) {
206        conflicts.push(MetadataConflict::CameraInfo { values: unique });
207    }
208
209    // Check capture time conflicts
210    let capture_time_values: Vec<String> = assets
211        .iter()
212        .filter_map(|a| a.exif_info.as_ref())
213        .filter_map(|e| e.date_time_original.clone())
214        .collect();
215
216    if let Some(unique) = find_unique_strings(&capture_time_values) {
217        conflicts.push(MetadataConflict::CaptureTime { values: unique });
218    }
219
220    conflicts
221}
222
223/// Check if GPS coordinates have conflicts beyond the threshold.
224fn has_gps_conflict(coords: &[(f64, f64)]) -> bool {
225    if coords.len() < 2 {
226        return false;
227    }
228
229    for i in 0..coords.len() {
230        for j in (i + 1)..coords.len() {
231            let (lat1, lon1) = coords[i];
232            let (lat2, lon2) = coords[j];
233            if (lat1 - lat2).abs() > GPS_THRESHOLD || (lon1 - lon2).abs() > GPS_THRESHOLD {
234                return true;
235            }
236        }
237    }
238
239    false
240}
241
242/// Deduplicate GPS coordinates within threshold.
243fn dedupe_gps(coords: &[(f64, f64)]) -> Vec<(f64, f64)> {
244    let mut unique: Vec<(f64, f64)> = Vec::new();
245
246    for &(lat, lon) in coords {
247        let is_duplicate = unique.iter().any(|&(ulat, ulon)| {
248            (lat - ulat).abs() <= GPS_THRESHOLD && (lon - ulon).abs() <= GPS_THRESHOLD
249        });
250
251        if !is_duplicate {
252            unique.push((lat, lon));
253        }
254    }
255
256    unique
257}
258
/// Collect the distinct values of a string list, ignoring case and
/// surrounding whitespace.
///
/// Values that are empty after trimming are skipped. Each distinct value is
/// reported as its first occurrence, trimmed but with the original casing.
/// Returns `None` unless at least two distinct values remain.
fn find_unique_strings(values: &[String]) -> Option<Vec<String>> {
    let mut normalized_seen: Vec<String> = Vec::new();
    let mut distinct: Vec<String> = Vec::new();

    for raw in values {
        let trimmed = raw.trim();
        let key = trimmed.to_lowercase();

        if key.is_empty() || normalized_seen.contains(&key) {
            continue;
        }

        normalized_seen.push(key);
        distinct.push(trimmed.to_string());
    }

    (distinct.len() > 1).then_some(distinct)
}
283
/// A scored asset with metadata score and file information.
///
/// One of these is built per asset when a duplicate group is analyzed; the
/// optional fields are `None` when the asset has no EXIF info or the EXIF
/// info does not report the value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScoredAsset {
    /// Asset unique identifier
    pub asset_id: String,

    /// Original filename
    pub filename: String,

    /// Metadata completeness score (used for consolidation decisions)
    pub score: MetadataScore,

    /// File size in bytes (secondary tiebreaker)
    pub file_size: Option<u64>,

    /// Image dimensions (width, height) in pixels - primary selection criterion.
    /// Present only when both width and height are known.
    pub dimensions: Option<(u32, u32)>,
}
302
/// Analysis result for a duplicate group.
///
/// Contains the selected winner, losers, detected conflicts,
/// and whether manual review is recommended.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateAnalysis {
    /// Duplicate group identifier
    pub duplicate_id: String,

    /// The asset selected as the winner: largest pixel dimensions, then
    /// largest file size, then earliest position in the group. The metadata
    /// score is recorded on the asset but does not decide the winner.
    pub winner: ScoredAsset,

    /// The remaining assets of the group (deletion candidates), ordered from
    /// best to worst by the same ranking used to pick the winner
    pub losers: Vec<ScoredAsset>,

    /// Detected metadata conflicts
    pub conflicts: Vec<MetadataConflict>,

    /// Whether manual review is recommended; true exactly when `conflicts`
    /// is non-empty
    pub needs_review: bool,
}
324
325impl DuplicateAnalysis {
326    /// Analyze a duplicate group and select a winner.
327    ///
328    /// The winner is selected based on:
329    /// 1. Largest dimensions (width × height pixels) - best quality
330    /// 2. Largest file size (tiebreaker)
331    /// 3. First in list (stable sort, final tiebreaker)
332    ///
333    /// Metadata scores are still calculated and stored for consolidation decisions.
334    ///
335    /// # Arguments
336    ///
337    /// * `group` - The duplicate group to analyze
338    ///
339    /// # Returns
340    ///
341    /// Analysis result with winner, losers, and conflict information
342    pub fn from_group(group: &DuplicateGroup) -> Self {
343        // Score all assets and capture dimensions
344        let mut scored: Vec<ScoredAsset> = group
345            .assets
346            .iter()
347            .map(|asset| {
348                let dimensions = asset.exif_info.as_ref().and_then(|e| {
349                    match (e.exif_image_width, e.exif_image_height) {
350                        (Some(w), Some(h)) => Some((w, h)),
351                        _ => None,
352                    }
353                });
354                ScoredAsset {
355                    asset_id: asset.id.clone(),
356                    filename: asset.original_file_name.clone(),
357                    score: MetadataScore::from_asset(asset),
358                    file_size: asset.exif_info.as_ref().and_then(|e| e.file_size_in_byte),
359                    dimensions,
360                }
361            })
362            .collect();
363
364        // Sort by dimensions (pixels) descending, then file size descending (stable sort)
365        scored.sort_by(|a, b| {
366            // Primary: largest dimensions (width × height)
367            let pixels_a = a
368                .dimensions
369                .map(|(w, h)| u64::from(w) * u64::from(h))
370                .unwrap_or(0);
371            let pixels_b = b
372                .dimensions
373                .map(|(w, h)| u64::from(w) * u64::from(h))
374                .unwrap_or(0);
375
376            match pixels_b.cmp(&pixels_a) {
377                std::cmp::Ordering::Equal => {
378                    // Secondary: larger file size wins
379                    let size_a = a.file_size.unwrap_or(0);
380                    let size_b = b.file_size.unwrap_or(0);
381                    size_b.cmp(&size_a)
382                }
383                other => other,
384            }
385        });
386
387        // Detect conflicts
388        let conflicts = detect_conflicts(&group.assets);
389        let needs_review = !conflicts.is_empty();
390
391        // Split into winner and losers
392        let winner = scored.remove(0);
393        let losers = scored;
394
395        Self {
396            duplicate_id: group.duplicate_id.clone(),
397            winner,
398            losers,
399            conflicts,
400            needs_review,
401        }
402    }
403}
404
#[cfg(test)]
mod tests {
    use super::*;

    /// Turn string literals into the owned strings the helpers expect.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    #[test]
    fn test_metadata_score_default() {
        // The default score carries no points at all.
        assert_eq!(MetadataScore::default().total, 0);
    }

    #[test]
    fn test_gps_conflict_detection() {
        let london = (51.5074, -0.1278);

        // Identical coordinates are within the threshold.
        assert!(!has_gps_conflict(&[london, london]));

        // Coordinates far beyond the threshold conflict.
        assert!(has_gps_conflict(&[london, (52.0, -0.5)]));
    }

    #[test]
    fn test_find_unique_strings() {
        // A single value cannot conflict with anything.
        assert!(find_unique_strings(&owned(&["America/New_York"])).is_none());

        // Values differing only by case count as the same value.
        let same_ignoring_case = owned(&["America/New_York", "america/new_york"]);
        assert!(find_unique_strings(&same_ignoring_case).is_none());

        // Genuinely different values are both reported.
        let different = owned(&["America/New_York", "Europe/London"]);
        let unique = find_unique_strings(&different).unwrap();
        assert_eq!(unique.len(), 2);
    }
}