// oximedia-dedup 0.1.0
//
// Media deduplication and duplicate detection for OxiMedia
// Documentation
//! Content-signature types for robust media identification.
//!
//! Provides `SignatureType`, `ContentSignature`, and `SignatureDatabase`
//! for storing and matching content signatures across a media library.

#![allow(dead_code)]

use std::collections::HashMap;

/// Category of content signature attached to a media asset.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SignatureType {
    /// Perceptual hash computed from the visual content.
    PerceptualVisual,
    /// Perceptual hash computed from the audio content.
    PerceptualAudio,
    /// Cryptographic (exact) hash over the raw bytes.
    Cryptographic,
    /// Fingerprint produced by a machine-learning model.
    NeuralEmbedding,
    /// Lightweight signature based on a thumbnail.
    Thumbnail,
}

impl SignatureType {
    /// Report whether this kind is classed as perceptual (approximate matching).
    ///
    /// The two perceptual hashes and the neural embedding are perceptual;
    /// `Cryptographic` and `Thumbnail` are not.
    #[must_use]
    pub const fn is_perceptual(self) -> bool {
        // Exhaustive on purpose: a newly added variant must be classified here.
        match self {
            Self::PerceptualVisual | Self::PerceptualAudio | Self::NeuralEmbedding => true,
            Self::Cryptographic | Self::Thumbnail => false,
        }
    }

    /// Report whether signatures of this kind are compared by exact equality.
    ///
    /// Only `Cryptographic` signatures are.
    #[must_use]
    pub const fn supports_exact_match(self) -> bool {
        match self {
            Self::Cryptographic => true,
            Self::PerceptualVisual
            | Self::PerceptualAudio
            | Self::NeuralEmbedding
            | Self::Thumbnail => false,
        }
    }

    /// Short, static, kebab-case label naming this kind.
    #[must_use]
    pub const fn label(self) -> &'static str {
        match self {
            Self::Cryptographic => "cryptographic",
            Self::Thumbnail => "thumbnail",
            Self::NeuralEmbedding => "neural-embedding",
            Self::PerceptualAudio => "perceptual-audio",
            Self::PerceptualVisual => "perceptual-visual",
        }
    }
}

/// A content signature for a single piece of media.
#[derive(Debug, Clone)]
pub struct ContentSignature {
    /// Unique identifier of the media asset.
    pub asset_id: String,
    /// Type of signature.
    pub sig_type: SignatureType,
    /// Raw signature bytes.
    pub data: Vec<u8>,
    /// Confidence score, nominally in the range 0.0–1.0.
    ///
    /// The value is always present (this is not an `Option`) and is stored
    /// exactly as supplied; this type neither validates nor clamps it.
    pub confidence: f64,
}

impl ContentSignature {
    /// Create a new `ContentSignature`.
    ///
    /// `asset_id` is converted into an owned `String`; `data` and `confidence`
    /// are stored as given, without validation.
    #[must_use]
    pub fn new(
        asset_id: impl Into<String>,
        sig_type: SignatureType,
        data: Vec<u8>,
        confidence: f64,
    ) -> Self {
        Self {
            asset_id: asset_id.into(),
            sig_type,
            data,
            confidence,
        }
    }

    /// Return `true` if this signature matches `other` with at most
    /// `tolerance` differing bytes.
    ///
    /// Signatures of different types or different lengths never match.
    /// For exact (cryptographic) signatures `tolerance` is ignored and
    /// byte-equality is required. Every other signature type is compared
    /// position by position, counting bytes that differ. The asset ids and
    /// confidence values play no part in the comparison.
    #[must_use]
    pub fn matches(&self, other: &Self, tolerance: u32) -> bool {
        if self.sig_type != other.sig_type || self.data.len() != other.data.len() {
            return false;
        }
        if self.sig_type.supports_exact_match() {
            return self.data == other.data;
        }
        // Spend one unit of the tolerance budget per differing byte.
        // `checked_sub` yields `None` on the first byte past the budget, which
        // makes `try_fold` stop immediately. Nothing is accumulated upward, so
        // no counter can overflow however long the signature is.
        self.data
            .iter()
            .zip(&other.data)
            .try_fold(tolerance, |budget, (a, b)| {
                if a == b {
                    Some(budget)
                } else {
                    budget.checked_sub(1)
                }
            })
            .is_some()
    }

    /// Return the length of the signature data in bytes.
    #[must_use]
    pub fn data_len(&self) -> usize {
        self.data.len()
    }
}

/// An in-memory database of `ContentSignature` values, grouped by asset id.
#[derive(Debug, Default)]
pub struct SignatureDatabase {
    /// Signatures keyed by `asset_id`; each list is in insertion order.
    entries: HashMap<String, Vec<ContentSignature>>,
}

impl SignatureDatabase {
    /// Create a new, empty database.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Store a signature, appending it to the list for its `asset_id`.
    ///
    /// No de-duplication is performed: storing an equal signature twice keeps
    /// both copies.
    pub fn store(&mut self, sig: ContentSignature) {
        // The id is cloned because the map key and the stored signature each
        // need their own owned copy.
        self.entries
            .entry(sig.asset_id.clone())
            .or_default()
            .push(sig);
    }

    /// Look up all signatures associated with `asset_id`.
    ///
    /// Returns an empty slice when the asset is unknown.
    #[must_use]
    pub fn lookup(&self, asset_id: &str) -> &[ContentSignature] {
        match self.entries.get(asset_id) {
            Some(sigs) => sigs.as_slice(),
            None => &[],
        }
    }

    /// Return the total number of signatures stored across all assets.
    ///
    /// Despite the name, this counts every stored signature; it is unrelated
    /// to the results of [`Self::find_matches`].
    #[must_use]
    pub fn match_count(&self) -> usize {
        self.entries.values().map(Vec::len).sum()
    }

    /// Find all assets whose signatures match `query` within `tolerance`.
    ///
    /// Returns a list of `(asset_id, matching_signature_count)` pairs, sorted
    /// by asset id so the result does not depend on hash-map iteration order.
    /// Signatures stored under the query's own `asset_id` are never reported.
    #[must_use]
    pub fn find_matches(&self, query: &ContentSignature, tolerance: u32) -> Vec<(String, usize)> {
        let mut hits: Vec<(String, usize)> = self
            .entries
            .iter()
            // Skip the query's own asset up front rather than counting its
            // matches only to throw the result away.
            .filter(|(id, _)| id.as_str() != query.asset_id.as_str())
            .filter_map(|(id, sigs)| {
                let count = sigs.iter().filter(|s| query.matches(s, tolerance)).count();
                if count == 0 {
                    None
                } else {
                    Some((id.clone(), count))
                }
            })
            .collect();
        // Asset ids are unique map keys, so an unstable sort is sufficient.
        hits.sort_unstable();
        hits
    }

    /// Remove all signatures for `asset_id`, returning the removed list.
    ///
    /// Returns an empty list when the asset is unknown.
    pub fn remove_asset(&mut self, asset_id: &str) -> Vec<ContentSignature> {
        self.entries.remove(asset_id).unwrap_or_default()
    }

    /// Return the number of distinct assets tracked.
    #[must_use]
    pub fn asset_count(&self) -> usize {
        self.entries.len()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a signature with confidence 1.0 for the given asset, type and bytes.
    fn make_sig(asset_id: &str, sig_type: SignatureType, data: Vec<u8>) -> ContentSignature {
        ContentSignature::new(asset_id, sig_type, data, 1.0)
    }

    #[test]
    fn test_sig_type_is_perceptual_visual() {
        let kind = SignatureType::PerceptualVisual;
        assert!(kind.is_perceptual());
    }

    #[test]
    fn test_sig_type_is_perceptual_audio() {
        let kind = SignatureType::PerceptualAudio;
        assert!(kind.is_perceptual());
    }

    #[test]
    fn test_sig_type_not_perceptual_crypto() {
        let kind = SignatureType::Cryptographic;
        assert!(!kind.is_perceptual());
    }

    #[test]
    fn test_sig_type_supports_exact_match() {
        let exact = SignatureType::Cryptographic;
        let fuzzy = SignatureType::PerceptualVisual;
        assert!(exact.supports_exact_match());
        assert!(!fuzzy.supports_exact_match());
    }

    #[test]
    fn test_sig_type_label_nonempty() {
        let every_kind = [
            SignatureType::PerceptualVisual,
            SignatureType::PerceptualAudio,
            SignatureType::Cryptographic,
            SignatureType::NeuralEmbedding,
            SignatureType::Thumbnail,
        ];
        assert!(every_kind.iter().all(|kind| !kind.label().is_empty()));
    }

    #[test]
    fn test_signature_exact_match_identical() {
        let digest = vec![1, 2, 3, 4];
        let left = make_sig("a1", SignatureType::Cryptographic, digest.clone());
        let right = make_sig("a2", SignatureType::Cryptographic, digest);
        assert!(left.matches(&right, 0));
    }

    #[test]
    fn test_signature_exact_match_different() {
        let left = make_sig("a1", SignatureType::Cryptographic, vec![1, 2, 3, 4]);
        let right = make_sig("a2", SignatureType::Cryptographic, vec![1, 2, 3, 5]);
        assert!(!left.matches(&right, 0));
    }

    #[test]
    fn test_signature_perceptual_within_tolerance() {
        // Exactly one byte differs, which is within a tolerance of one.
        let baseline = make_sig("a1", SignatureType::PerceptualVisual, vec![0; 4]);
        let nudged = make_sig("a2", SignatureType::PerceptualVisual, vec![1, 0, 0, 0]);
        assert!(baseline.matches(&nudged, 1));
    }

    #[test]
    fn test_signature_perceptual_exceeds_tolerance() {
        // Two bytes differ, which is more than a tolerance of one allows.
        let baseline = make_sig("a1", SignatureType::PerceptualVisual, vec![0; 4]);
        let drifted = make_sig("a2", SignatureType::PerceptualVisual, vec![1, 1, 0, 0]);
        assert!(!baseline.matches(&drifted, 1));
    }

    #[test]
    fn test_signature_type_mismatch() {
        // Identical bytes, but the types differ, so no tolerance can help.
        let visual = make_sig("a1", SignatureType::PerceptualVisual, vec![0; 4]);
        let crypto = make_sig("a2", SignatureType::Cryptographic, vec![0; 4]);
        assert!(!visual.matches(&crypto, 10));
    }

    #[test]
    fn test_database_store_and_lookup() {
        let mut db = SignatureDatabase::new();
        let stored = make_sig("asset1", SignatureType::Cryptographic, vec![0xAB; 4]);
        db.store(stored);
        assert_eq!(db.lookup("asset1").len(), 1);
    }

    #[test]
    fn test_database_lookup_missing() {
        let empty_db = SignatureDatabase::new();
        let found = empty_db.lookup("nonexistent");
        assert!(found.is_empty());
    }

    #[test]
    fn test_database_match_count() {
        let mut db = SignatureDatabase::new();
        let plan = [
            ("a", SignatureType::Cryptographic),
            ("a", SignatureType::PerceptualVisual),
            ("b", SignatureType::Cryptographic),
        ];
        for (asset, kind) in plan {
            db.store(make_sig(asset, kind, vec![1; 4]));
        }
        assert_eq!(db.match_count(), 3);
    }

    #[test]
    fn test_database_find_matches() {
        let mut db = SignatureDatabase::new();
        db.store(make_sig("other", SignatureType::PerceptualVisual, vec![0; 4]));

        let probe = make_sig("query", SignatureType::PerceptualVisual, vec![0, 0, 0, 1]);
        let hit_ids: Vec<String> = db
            .find_matches(&probe, 1)
            .into_iter()
            .map(|(id, _)| id)
            .collect();
        assert_eq!(hit_ids, ["other"]);
    }

    #[test]
    fn test_database_remove_asset() {
        let mut db = SignatureDatabase::new();
        db.store(make_sig("x", SignatureType::Cryptographic, vec![0; 4]));
        assert_eq!(db.asset_count(), 1);

        let evicted = db.remove_asset("x");
        assert_eq!(evicted.len(), 1);
        assert_eq!(db.asset_count(), 0);
    }
}