//! Duplicate-file detection.
//!
//! The entry point is `DuplicateDetector`: files are registered with
//! `DuplicateDetector::add_file`, which stores each path together with its
//! file hash in a `DedupDatabase`, and `DuplicateDetector::find_duplicates`
//! builds a `DuplicateReport` for a chosen `DetectionStrategy`.
//!
//! In this file only exact-hash detection produces results; the remaining
//! strategy back-ends are placeholders that return no groups.

// Lint policy: warn on undocumented public items. The `allow`s below silence
// the listed clippy lints and dead-code warnings for everything in scope.
#![warn(missing_docs)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::similar_names)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::too_many_arguments)]
#![allow(dead_code)]
// Public submodules. Of these, this file itself references `database` and
// `report` (re-exported below) and `hash` (used when registering files).
pub mod audio;
pub mod bloom_filter;
pub mod cluster;
pub mod content_id;
pub mod content_signature;
pub mod database;
pub mod dedup_cache;
pub mod dedup_index;
pub mod dedup_policy;
pub mod dedup_report;
pub mod dedup_report_ext;
pub mod dedup_stats;
pub mod frame_hash;
pub mod fuzzy_match;
pub mod hash;
pub mod hash_store;
pub mod lsh_index;
pub mod merge_strategy;
pub mod metadata;
pub mod near_duplicate;
pub mod perceptual_hash;
pub mod phash;
pub mod report;
pub mod rolling_hash;
pub mod segment_dedup;
pub mod similarity_index;
pub mod video_dedup;
pub mod visual;
use std::path::{Path, PathBuf};
use thiserror::Error;
pub use database::DedupDatabase;
pub use report::{DuplicateGroup, DuplicateReport, SimilarityScore};
/// Error type returned by the deduplication API.
///
/// Variants marked `#[from]` are produced automatically when `?` is applied
/// to the wrapped error type; the `String` variants carry a free-form message.
#[derive(Error, Debug)]
pub enum DedupError {
/// An underlying `std::io::Error`.
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),
/// An underlying `sqlx::Error`.
#[error("Database error: {0}")]
Database(#[from] sqlx::Error),
/// Hashing failed; the payload is the message shown after "Hashing error: ".
#[error("Hashing error: {0}")]
Hash(String),
/// Visual processing failed; the payload is the message to display.
#[error("Visual processing error: {0}")]
Visual(String),
/// Audio processing failed; the payload is the message to display.
#[error("Audio processing error: {0}")]
Audio(String),
/// Metadata processing failed; the payload is the message to display.
#[error("Metadata processing error: {0}")]
Metadata(String),
/// The given path does not exist (returned by `DuplicateDetector::add_file`).
#[error("File not found: {0}")]
FileNotFound(PathBuf),
/// A configuration value was rejected; the payload is the message to display.
#[error("Invalid configuration: {0}")]
InvalidConfig(String),
/// An underlying `oximedia_core::OxiError`.
#[error("OxiMedia core error: {0}")]
Core(#[from] oximedia_core::OxiError),
}
/// Shorthand for a `Result` whose error type is [`DedupError`].
pub type DedupResult<T> = Result<T, DedupError>;
/// Selects which detection back-ends a duplicate search runs.
///
/// The first seven variants each enable exactly one back-end; `All`,
/// `VisualAll` and `Fast` are presets that enable several at once.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionStrategy {
    /// Exact file-hash comparison only.
    ExactHash,
    /// Perceptual-hash comparison only.
    PerceptualHash,
    /// SSIM comparison only.
    Ssim,
    /// Histogram comparison only.
    Histogram,
    /// Feature matching only.
    FeatureMatch,
    /// Audio fingerprinting only.
    AudioFingerprint,
    /// Metadata comparison only.
    Metadata,
    /// Every back-end.
    All,
    /// The four visual back-ends: perceptual hash, SSIM, histogram and
    /// feature matching.
    VisualAll,
    /// Exact hash, perceptual hash and metadata.
    Fast,
}

impl DetectionStrategy {
    // One bit per detection back-end.
    const HASH: u8 = 1 << 0;
    const PERCEPTUAL: u8 = 1 << 1;
    const SSIM: u8 = 1 << 2;
    const HISTOGRAM: u8 = 1 << 3;
    const FEATURE_MATCH: u8 = 1 << 4;
    const AUDIO: u8 = 1 << 5;
    const METADATA: u8 = 1 << 6;
    // The visual back-ends as a group, shared by `VisualAll` and `All`.
    const VISUAL: u8 = Self::PERCEPTUAL | Self::SSIM | Self::HISTOGRAM | Self::FEATURE_MATCH;

    /// Returns the set of back-ends enabled by this strategy as a bitmask.
    ///
    /// The match is exhaustive on purpose: adding a variant fails to compile
    /// until its back-ends are listed here.
    fn detector_mask(self) -> u8 {
        match self {
            Self::ExactHash => Self::HASH,
            Self::PerceptualHash => Self::PERCEPTUAL,
            Self::Ssim => Self::SSIM,
            Self::Histogram => Self::HISTOGRAM,
            Self::FeatureMatch => Self::FEATURE_MATCH,
            Self::AudioFingerprint => Self::AUDIO,
            Self::Metadata => Self::METADATA,
            Self::VisualAll => Self::VISUAL,
            Self::All => Self::HASH | Self::VISUAL | Self::AUDIO | Self::METADATA,
            Self::Fast => Self::HASH | Self::PERCEPTUAL | Self::METADATA,
        }
    }

    /// Returns `true` when this strategy enables the back-end identified by
    /// the single-bit `detector` flag.
    fn enables(self, detector: u8) -> bool {
        self.detector_mask() & detector != 0
    }

    /// Whether exact-hash comparison is enabled (`ExactHash`, `All`, `Fast`).
    #[must_use]
    pub fn includes_hash(self) -> bool {
        self.enables(Self::HASH)
    }

    /// Whether perceptual hashing is enabled
    /// (`PerceptualHash`, `All`, `VisualAll`, `Fast`).
    #[must_use]
    pub fn includes_perceptual(self) -> bool {
        self.enables(Self::PERCEPTUAL)
    }

    /// Whether SSIM comparison is enabled (`Ssim`, `All`, `VisualAll`).
    #[must_use]
    pub fn includes_ssim(self) -> bool {
        self.enables(Self::SSIM)
    }

    /// Whether histogram comparison is enabled (`Histogram`, `All`, `VisualAll`).
    #[must_use]
    pub fn includes_histogram(self) -> bool {
        self.enables(Self::HISTOGRAM)
    }

    /// Whether feature matching is enabled (`FeatureMatch`, `All`, `VisualAll`).
    #[must_use]
    pub fn includes_feature_match(self) -> bool {
        self.enables(Self::FEATURE_MATCH)
    }

    /// Whether audio fingerprinting is enabled (`AudioFingerprint`, `All`).
    #[must_use]
    pub fn includes_audio(self) -> bool {
        self.enables(Self::AUDIO)
    }

    /// Whether metadata comparison is enabled (`Metadata`, `All`, `Fast`).
    #[must_use]
    pub fn includes_metadata(self) -> bool {
        self.enables(Self::METADATA)
    }
}
/// Tunable settings for a `DuplicateDetector`.
///
/// Apart from `database_path`, none of these fields is read by code in this
/// file; the notes on their meaning below are therefore tentative.
#[derive(Debug, Clone)]
pub struct DedupConfig {
    /// Path handed to `DedupDatabase::open` when a detector is created.
    pub database_path: PathBuf,
    /// Perceptual-hash threshold (default `0.95`).
    /// NOTE(review): presumably a similarity score; confirm range with the consumer.
    pub perceptual_threshold: f64,
    /// SSIM threshold (default `0.90`).
    pub ssim_threshold: f64,
    /// Histogram-comparison threshold (default `0.85`).
    pub histogram_threshold: f64,
    /// Feature-matching threshold (default `50`).
    /// NOTE(review): looks like a match count rather than a ratio; confirm.
    pub feature_match_threshold: usize,
    /// Audio-fingerprint threshold (default `0.90`).
    pub audio_threshold: f64,
    /// Metadata-comparison threshold (default `0.80`).
    pub metadata_threshold: f64,
    /// Whether parallel processing is requested (default `true`).
    pub parallel: bool,
    /// Number of frames to sample (default `10`).
    pub sample_frames: usize,
    /// Chunk size (default `4096`); the unit is not established in this file.
    pub chunk_size: usize,
}

impl Default for DedupConfig {
    /// Builds the stock configuration: a database named `oximedia_dedup.db`
    /// in the working directory, parallel processing on, and the thresholds
    /// listed on each field.
    fn default() -> Self {
        // Floating-point cut-offs are named first so they read as one table.
        let (perceptual_threshold, ssim_threshold, histogram_threshold) = (0.95, 0.90, 0.85);
        let (audio_threshold, metadata_threshold) = (0.90, 0.80);

        Self {
            database_path: "oximedia_dedup.db".into(),
            perceptual_threshold,
            ssim_threshold,
            histogram_threshold,
            audio_threshold,
            metadata_threshold,
            feature_match_threshold: 50,
            sample_frames: 10,
            chunk_size: 4096,
            parallel: true,
        }
    }
}
/// Registers files in a [`DedupDatabase`] and searches it for duplicates.
///
/// Created with `DuplicateDetector::new`; call `close` when finished so the
/// database is shut down explicitly.
pub struct DuplicateDetector {
// Settings supplied at construction time.
config: DedupConfig,
// Database opened from `config.database_path`.
database: DedupDatabase,
}
impl DuplicateDetector {
/// Opens the database at `config.database_path` and wraps it, together with
/// `config`, in a new detector.
///
/// # Errors
///
/// Propagates any error returned by `DedupDatabase::open`.
pub async fn new(config: DedupConfig) -> DedupResult<Self> {
    Ok(Self {
        // Listed first so `config` is only borrowed while the database
        // opens and can be moved into the struct afterwards.
        database: DedupDatabase::open(&config.database_path).await?,
        config,
    })
}
/// Hashes the file at `path` and records the path and its hex digest in the
/// database.
///
/// # Errors
///
/// * [`DedupError::FileNotFound`] when `path` does not exist.
/// * [`DedupError::Io`] when the path cannot be inspected for any other
///   reason (for example, insufficient permissions).
/// * Any error produced while hashing the file or inserting the record.
pub async fn add_file(&mut self, path: impl AsRef<Path>) -> DedupResult<()> {
    let path = path.as_ref();

    // `Path::exists` reports *every* metadata failure as "missing", which
    // would mislabel e.g. a permission problem as `FileNotFound`. Inspect the
    // error kind so only a genuinely absent path is reported that way.
    match std::fs::metadata(path) {
        Ok(_) => {}
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
            return Err(DedupError::FileNotFound(path.to_path_buf()));
        }
        Err(err) => return Err(DedupError::Io(err)),
    }

    // NOTE(review): this call is synchronous; if it reads the whole file it
    // will block the async executor for large inputs — confirm and consider
    // moving it onto a blocking task.
    let file_hash = hash::compute_file_hash(path)?;
    self.database.insert_file(path, &file_hash.to_hex()).await?;
    Ok(())
}
/// Registers every path in `paths`, one after another, via `add_file`.
///
/// A failure on one path does not stop the batch. The returned vector holds
/// one `"<path>: <error>"` string per path that could not be added and is
/// empty when everything succeeded.
///
/// # Errors
///
/// The current implementation always returns `Ok`; per-file failures are
/// reported through the returned list instead.
pub async fn add_files(&mut self, paths: &[impl AsRef<Path>]) -> DedupResult<Vec<String>> {
    let mut failures = Vec::new();
    for candidate in paths {
        match self.add_file(candidate).await {
            Ok(()) => {}
            Err(cause) => {
                failures.push(format!("{}: {}", candidate.as_ref().display(), cause));
            }
        }
    }
    Ok(failures)
}
/// Runs every back-end enabled by `strategy` and collects the groups they
/// find into a single report.
///
/// Back-ends run in a fixed order: exact hash, perceptual hash, SSIM,
/// histogram, feature matching, audio, metadata.
///
/// # Errors
///
/// Returns the first error raised by any enabled back-end; later back-ends
/// are not run in that case.
pub async fn find_duplicates(
    &self,
    strategy: DetectionStrategy,
) -> DedupResult<DuplicateReport> {
    let mut report = DuplicateReport::new();

    if strategy.includes_hash() {
        report.add_groups(self.find_hash_duplicates().await?);
    }
    if strategy.includes_perceptual() {
        report.add_groups(self.find_perceptual_duplicates().await?);
    }
    if strategy.includes_ssim() {
        report.add_groups(self.find_ssim_duplicates().await?);
    }
    if strategy.includes_histogram() {
        report.add_groups(self.find_histogram_duplicates().await?);
    }
    if strategy.includes_feature_match() {
        report.add_groups(self.find_feature_duplicates().await?);
    }
    if strategy.includes_audio() {
        report.add_groups(self.find_audio_duplicates().await?);
    }
    if strategy.includes_metadata() {
        report.add_groups(self.find_metadata_duplicates().await?);
    }

    Ok(report)
}
/// Builds one group per hash that the database reports for more than one
/// file.
///
/// Each group carries a single `"exact_hash"` score of `1.0`, with the shared
/// hash attached as `("hash", <value>)` metadata.
///
/// # Errors
///
/// Propagates any error from the database query.
async fn find_hash_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let by_hash = self.database.find_duplicate_hashes().await?;

    let groups: Vec<DuplicateGroup> = by_hash
        .into_iter()
        // A hash with a single file is not a duplicate.
        .filter(|(_, members)| members.len() > 1)
        .map(|(digest, members)| DuplicateGroup {
            files: members,
            scores: vec![SimilarityScore {
                method: "exact_hash".to_string(),
                score: 1.0,
                metadata: vec![("hash".to_string(), digest)],
            }],
        })
        .collect();

    Ok(groups)
}
/// Placeholder for perceptual-hash detection: currently always yields an
/// empty list of groups.
async fn find_perceptual_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let no_groups: Vec<DuplicateGroup> = Vec::new();
    Ok(no_groups)
}
/// Placeholder for SSIM-based detection: currently always yields an empty
/// list of groups.
async fn find_ssim_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let no_groups: Vec<DuplicateGroup> = Vec::new();
    Ok(no_groups)
}
/// Placeholder for histogram-based detection: currently always yields an
/// empty list of groups.
async fn find_histogram_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let no_groups: Vec<DuplicateGroup> = Vec::new();
    Ok(no_groups)
}
/// Placeholder for feature-matching detection: currently always yields an
/// empty list of groups.
async fn find_feature_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let no_groups: Vec<DuplicateGroup> = Vec::new();
    Ok(no_groups)
}
/// Placeholder for audio-fingerprint detection: currently always yields an
/// empty list of groups.
async fn find_audio_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let no_groups: Vec<DuplicateGroup> = Vec::new();
    Ok(no_groups)
}
/// Placeholder for metadata-based detection: currently always yields an
/// empty list of groups.
async fn find_metadata_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let no_groups: Vec<DuplicateGroup> = Vec::new();
    Ok(no_groups)
}
/// Summarises the database contents.
///
/// `duplicate_files` is the number of stored files minus the number of
/// distinct hashes, clamped at zero.
///
/// # Errors
///
/// Propagates any error from either count query.
pub async fn get_stats(&self) -> DedupResult<DedupStats> {
    let db = &self.database;
    let total_files = db.count_files().await?;
    let total_hashes = db.count_unique_hashes().await?;

    // Saturating so an inconsistent pair of counts cannot underflow.
    let duplicate_files = total_files.saturating_sub(total_hashes);

    Ok(DedupStats {
        total_files,
        total_hashes,
        duplicate_files,
    })
}
/// Consumes the detector and closes its database.
///
/// # Errors
///
/// Propagates any error reported while closing the database.
pub async fn close(self) -> DedupResult<()> {
    let shutdown = self.database.close().await;
    shutdown?;
    Ok(())
}
}
/// Summary counts returned by `DuplicateDetector::get_stats`.
#[derive(Debug, Clone)]
pub struct DedupStats {
/// Number of files recorded in the database.
pub total_files: usize,
/// Number of distinct hashes recorded in the database.
pub total_hashes: usize,
/// `total_files` minus `total_hashes`, clamped at zero.
pub duplicate_files: usize,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks which back-ends the single-method and preset strategies enable.
    #[test]
    fn test_detection_strategy() {
        let exact = DetectionStrategy::ExactHash;
        assert!(exact.includes_hash());
        assert!(!exact.includes_perceptual());

        let all = DetectionStrategy::All;
        assert!(all.includes_hash());
        assert!(all.includes_perceptual());
        assert!(all.includes_audio());

        let fast = DetectionStrategy::Fast;
        assert!(fast.includes_hash());
        assert!(fast.includes_perceptual());
        assert!(!fast.includes_ssim());
    }

    /// Pins a few of the values produced by `DedupConfig::default`.
    #[test]
    fn test_config_default() {
        let DedupConfig {
            perceptual_threshold,
            ssim_threshold,
            parallel,
            ..
        } = DedupConfig::default();

        assert_eq!(perceptual_threshold, 0.95);
        assert_eq!(ssim_threshold, 0.90);
        assert!(parallel);
    }
}