#![warn(missing_docs)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::similar_names)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::too_many_arguments)]
#![allow(dead_code)]
pub mod audio;
pub mod bloom_filter;
pub mod cluster;
pub mod content_id;
pub mod content_signature;
pub mod cross_format;
#[cfg(feature = "sqlite")]
pub mod database;
pub mod dedup_cache;
pub mod dedup_index;
pub mod dedup_policy;
pub mod dedup_report;
pub mod dedup_report_ext;
pub mod dedup_stats;
pub mod frame_hash;
pub mod fuzzy_match;
pub mod hash;
pub mod hash_store;
pub mod incremental;
pub mod lsh_index;
pub mod merge_strategy;
pub mod metadata;
pub mod near_duplicate;
pub mod perceptual_hash;
pub mod phash;
pub mod progress;
pub mod report;
pub mod rolling_hash;
pub mod segment_dedup;
pub mod similarity_index;
pub mod video_dedup;
pub mod video_segment_dedup;
pub mod visual;
#[cfg(feature = "sqlite")]
use std::path::Path;
use std::path::PathBuf;
use thiserror::Error;
#[cfg(feature = "sqlite")]
pub use database::DedupDatabase;
pub use merge_strategy::{AppliedAction, MergeExecutor, MergeReport};
pub use report::{DuplicateGroup, DuplicateReport, SimilarityScore};
#[cfg(feature = "sqlite")]
/// Decodes a hexadecimal string into raw bytes.
///
/// Both upper- and lower-case digits are accepted. The empty string decodes
/// to an empty vector.
///
/// # Errors
///
/// Returns [`DedupError::Hash`] when the input has an odd number of bytes or
/// when any byte is not an ASCII hex digit. The reported position is the byte
/// offset of the offending two-character pair.
fn decode_hex_bytes(hex: &str) -> DedupResult<Vec<u8>> {
    if hex.len() % 2 != 0 {
        return Err(DedupError::Hash(format!(
            "odd-length hex string: len={}",
            hex.len()
        )));
    }

    // Decode on raw bytes rather than `&str` slices: slicing a `str` at a
    // non-character boundary panics, and `u8::from_str_radix` tolerates a
    // leading `+`, so neither is safe for untrusted stored fingerprints.
    let nibble = |byte: u8| char::from(byte).to_digit(16);

    hex.as_bytes()
        .chunks_exact(2)
        .enumerate()
        .map(|(pair_idx, pair)| match (nibble(pair[0]), nibble(pair[1])) {
            // Both nibbles are < 16, so the combined value always fits in a u8.
            (Some(high), Some(low)) => Ok((high * 16 + low) as u8),
            _ => Err(DedupError::Hash(format!(
                "invalid hex byte at {}: invalid digit found in string",
                pair_idx * 2
            ))),
        })
        .collect()
}
#[cfg(feature = "sqlite")]
/// Computes the cosine similarity of two vectors.
///
/// The dot product is taken over the common prefix of `a` and `b`, while each
/// magnitude is taken over the full slice. Returns `0.0` when either
/// magnitude is below `f64::EPSILON` (including for empty input).
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    let magnitude = |v: &[f64]| v.iter().map(|x| x * x).sum::<f64>().sqrt();
    let (mag_a, mag_b) = (magnitude(a), magnitude(b));

    if mag_a < f64::EPSILON || mag_b < f64::EPSILON {
        0.0
    } else {
        let dot: f64 = a.iter().zip(b).map(|(x, y)| x * y).sum();
        dot / (mag_a * mag_b)
    }
}
#[cfg(feature = "sqlite")]
/// Greedily clusters `items` by comparing each unclaimed item against every
/// later unclaimed item.
///
/// An item joins the cluster of the earliest anchor whose `dist_fn` distance
/// to it is at most `max_distance`. Comparisons are always against the
/// anchor, never between members. Each emitted group carries a single score
/// labelled `method`, holding the highest `sim_fn` value seen between the
/// anchor and any member (starting from `0.0`). Items with no match are not
/// reported.
///
/// This function never returns `Err`.
fn group_by_pairwise_similarity<H, FDist, FSim>(
    items: &[(String, H)],
    max_distance: u32,
    dist_fn: FDist,
    sim_fn: FSim,
    method: &str,
) -> DedupResult<Vec<DuplicateGroup>>
where
    FDist: Fn(&H, &H) -> u32,
    FSim: Fn(&H, &H) -> f64,
{
    let mut claimed = vec![false; items.len()];
    let mut clusters: Vec<DuplicateGroup> = Vec::new();

    for (anchor_idx, (anchor_path, anchor_hash)) in items.iter().enumerate() {
        if claimed[anchor_idx] {
            continue;
        }

        let mut members = vec![anchor_path.clone()];
        let mut top_similarity = 0.0f64;

        for (cand_idx, (cand_path, cand_hash)) in items.iter().enumerate().skip(anchor_idx + 1) {
            // The distance is only computed for candidates nobody has claimed yet.
            if claimed[cand_idx] || dist_fn(anchor_hash, cand_hash) > max_distance {
                continue;
            }
            let similarity = sim_fn(anchor_hash, cand_hash);
            members.push(cand_path.clone());
            claimed[cand_idx] = true;
            if similarity > top_similarity {
                top_similarity = similarity;
            }
        }

        if members.len() < 2 {
            continue;
        }
        claimed[anchor_idx] = true;
        clusters.push(DuplicateGroup {
            files: members,
            scores: vec![SimilarityScore {
                method: method.to_string(),
                score: top_similarity,
                metadata: Vec::new(),
            }],
        });
    }

    Ok(clusters)
}
/// Error type returned by the deduplication APIs in this module.
///
/// Variants marked `#[from]` are produced automatically by the `?` operator
/// from their source error type.
#[derive(Error, Debug)]
pub enum DedupError {
/// An underlying [`std::io::Error`].
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),
/// A database error; wraps `sqlx::Error` when the `sqlite` feature is on.
#[cfg(feature = "sqlite")]
#[error("Database error: {0}")]
Database(#[from] sqlx::Error),
/// A database error described by a message (builds without the `sqlite` feature).
#[cfg(not(feature = "sqlite"))]
#[error("Database error: {0}")]
Database(String),
/// A hashing failure described by a message; also used for malformed hex input.
#[error("Hashing error: {0}")]
Hash(String),
/// A visual-processing failure described by a message.
#[error("Visual processing error: {0}")]
Visual(String),
/// An audio-processing failure described by a message.
#[error("Audio processing error: {0}")]
Audio(String),
/// A metadata-processing failure described by a message.
#[error("Metadata processing error: {0}")]
Metadata(String),
/// The given path did not exist when it was checked.
#[error("File not found: {0}")]
FileNotFound(PathBuf),
/// A configuration problem described by a message.
#[error("Invalid configuration: {0}")]
InvalidConfig(String),
/// An error propagated from `oximedia_core`.
#[error("OxiMedia core error: {0}")]
Core(#[from] oximedia_core::OxiError),
}
/// Shorthand for a `Result` whose error type is [`DedupError`].
pub type DedupResult<T> = Result<T, DedupError>;
/// Selects which detection phases a duplicate search runs.
///
/// The single-method variants enable exactly one phase; the composite
/// variants (`All`, `VisualAll`, `Fast`) enable several. The `includes_*`
/// methods are the source of truth for the mapping.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionStrategy {
/// Exact file-hash comparison only.
ExactHash,
/// Perceptual-hash comparison only.
PerceptualHash,
/// SSIM comparison only.
Ssim,
/// Histogram comparison only.
Histogram,
/// Feature-match comparison only.
FeatureMatch,
/// Audio-fingerprint comparison only.
AudioFingerprint,
/// Metadata comparison only.
Metadata,
/// Every phase: hash, perceptual, SSIM, histogram, feature match, audio, metadata.
All,
/// The visual phases: perceptual, SSIM, histogram, and feature match.
VisualAll,
/// Exact hash, perceptual hash, and metadata.
Fast,
}
impl DetectionStrategy {
    /// Returns `true` when this strategy runs the exact-hash phase.
    #[must_use]
    pub fn includes_hash(self) -> bool {
        match self {
            Self::ExactHash | Self::All | Self::Fast => true,
            Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::VisualAll => false,
        }
    }

    /// Returns `true` when this strategy runs the perceptual-hash phase.
    #[must_use]
    pub fn includes_perceptual(self) -> bool {
        match self {
            Self::PerceptualHash | Self::All | Self::VisualAll | Self::Fast => true,
            Self::ExactHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata => false,
        }
    }

    /// Returns `true` when this strategy runs the SSIM phase.
    #[must_use]
    pub fn includes_ssim(self) -> bool {
        match self {
            Self::Ssim | Self::All | Self::VisualAll => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::Fast => false,
        }
    }

    /// Returns `true` when this strategy runs the histogram phase.
    #[must_use]
    pub fn includes_histogram(self) -> bool {
        match self {
            Self::Histogram | Self::All | Self::VisualAll => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::Fast => false,
        }
    }

    /// Returns `true` when this strategy runs the feature-match phase.
    #[must_use]
    pub fn includes_feature_match(self) -> bool {
        match self {
            Self::FeatureMatch | Self::All | Self::VisualAll => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::Fast => false,
        }
    }

    /// Returns `true` when this strategy runs the audio-fingerprint phase.
    #[must_use]
    pub fn includes_audio(self) -> bool {
        match self {
            Self::AudioFingerprint | Self::All => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::Metadata
            | Self::VisualAll
            | Self::Fast => false,
        }
    }

    /// Returns `true` when this strategy runs the metadata phase.
    #[must_use]
    pub fn includes_metadata(self) -> bool {
        match self {
            Self::Metadata | Self::All | Self::Fast => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::VisualAll => false,
        }
    }
}
/// Tunable settings for duplicate detection.
///
/// Several fields are not read by the code visible in this file; they are
/// marked below and are presumably consumed by other modules (to be confirmed).
#[derive(Debug, Clone)]
pub struct DedupConfig {
/// Path handed to `DedupDatabase::open` when a detector is created.
pub database_path: PathBuf,
/// Perceptual-hash similarity threshold; converted to a maximum Hamming
/// distance as `(1.0 - threshold) * 64.0`. Also used as the cosine-similarity
/// cutoff for stored feature vectors.
pub perceptual_threshold: f64,
/// Minimum SSIM value for two thumbnails to be grouped.
pub ssim_threshold: f64,
/// Minimum histogram-comparison value for two files to be grouped.
pub histogram_threshold: f64,
/// Not read in the visible code.
pub feature_match_threshold: usize,
/// Minimum audio-fingerprint similarity for two files to be grouped.
pub audio_threshold: f64,
/// Minimum overall metadata score for two files to be grouped.
pub metadata_threshold: f64,
/// Not read in the visible code.
pub parallel: bool,
/// Not read in the visible code.
pub sample_frames: usize,
/// Not read in the visible code.
pub chunk_size: usize,
/// Thumbnail side length used by the SSIM phase; values below 4 are raised to 4.
pub thumbnail_resolution: usize,
/// Enables the bloom filter on the detector and the perceptual-hash prescreen.
pub bloom_prescreen: bool,
/// Capacity passed to the bloom filter constructor and prescreen.
pub bloom_capacity: usize,
/// False-positive rate passed to the bloom filter constructor and prescreen.
pub bloom_fpr: f32,
/// Uses the LSH path for perceptual hashes instead of pairwise comparison.
pub use_lsh: bool,
/// Table count passed to `lsh_index::lsh_dedup_pass`.
pub lsh_num_tables: usize,
/// Bits-per-table value passed to `lsh_index::lsh_dedup_pass`.
pub lsh_bits_per_table: usize,
/// Seed passed to `lsh_index::lsh_dedup_pass`.
pub lsh_seed: u64,
}
/// Default configuration: LSH enabled, bloom prescreen disabled, and a
/// database file named `oximedia_dedup.db` relative to the working directory.
impl Default for DedupConfig {
fn default() -> Self {
Self {
database_path: PathBuf::from("oximedia_dedup.db"),
// With the 64-bit conversion used by the perceptual phase, 0.95 allows
// a Hamming distance of up to 3 bits.
perceptual_threshold: 0.95,
ssim_threshold: 0.90,
histogram_threshold: 0.85,
feature_match_threshold: 50,
audio_threshold: 0.90,
metadata_threshold: 0.80,
parallel: true,
sample_frames: 10,
chunk_size: 4096,
// The SSIM phase expects thumbnails of resolution * resolution bytes (64 here).
thumbnail_resolution: 8,
bloom_prescreen: false,
bloom_capacity: 10_000,
bloom_fpr: 0.01,
use_lsh: true,
lsh_num_tables: 8,
lsh_bits_per_table: 8,
lsh_seed: 42,
}
}
}
#[cfg(feature = "sqlite")]
/// Indexes files into a database and searches the index for duplicates.
///
/// Only available with the `sqlite` feature.
pub struct DuplicateDetector {
// Settings captured at construction time.
config: DedupConfig,
// Backing store for file hashes, fingerprints, and metadata.
database: DedupDatabase,
// Present only when `config.bloom_prescreen` was true at construction.
bloom: Option<bloom_filter::BloomFilter>,
}
#[cfg(feature = "sqlite")]
impl DuplicateDetector {
/// Opens the database at `config.database_path` and builds a detector.
///
/// A bloom filter sized from `bloom_capacity` / `bloom_fpr` is created only
/// when `config.bloom_prescreen` is set.
///
/// # Errors
///
/// Propagates any error from `DedupDatabase::open`.
pub async fn new(config: DedupConfig) -> DedupResult<Self> {
    let database = DedupDatabase::open(&config.database_path).await?;
    let bloom = config
        .bloom_prescreen
        .then(|| bloom_filter::BloomFilter::new(config.bloom_capacity, config.bloom_fpr));

    Ok(Self {
        config,
        database,
        bloom,
    })
}
pub async fn add_file(&mut self, path: impl AsRef<Path>) -> DedupResult<()> {
let path = path.as_ref();
if !path.exists() {
return Err(DedupError::FileNotFound(path.to_path_buf()));
}
let file_hash = hash::compute_file_hash(path)?;
if let Some(ref mut bloom) = self.bloom {
bloom.insert(file_hash.as_bytes());
}
self.database.insert_file(path, &file_hash.to_hex()).await?;
Ok(())
}
/// Adds each path in turn via [`Self::add_file`], collecting failures
/// instead of stopping at the first one.
///
/// Returns one `"<path>: <error>"` string per file that could not be added.
/// The outer `Result` is always `Ok`.
pub async fn add_files(&mut self, paths: &[impl AsRef<Path>]) -> DedupResult<Vec<String>> {
    let mut failures = Vec::new();
    for candidate in paths {
        match self.add_file(candidate).await {
            Ok(()) => {}
            Err(err) => failures.push(format!("{}: {}", candidate.as_ref().display(), err)),
        }
    }
    Ok(failures)
}
/// Hashes all `paths` in parallel with rayon, then records the results in
/// the database one by one.
///
/// Like [`Self::add_file`], every successfully hashed file is also inserted
/// into the bloom filter (when enabled) before its database write, so
/// [`Self::might_be_duplicate`] stays consistent with what was indexed.
///
/// Returns one `"<path>: <error>"` string per file that could not be hashed
/// or stored. The outer `Result` is always `Ok`.
pub async fn par_index_files<P>(&mut self, paths: &[P]) -> DedupResult<Vec<String>>
where
    P: AsRef<Path> + Sync,
{
    use rayon::prelude::*;

    // Phase 1: CPU/IO-bound hashing, fanned out across the rayon pool.
    let hash_results: Vec<(PathBuf, DedupResult<hash::FileHash>)> = paths
        .par_iter()
        .map(|p| {
            let path = p.as_ref().to_path_buf();
            let result = if path.exists() {
                hash::compute_file_hash(&path)
            } else {
                Err(DedupError::FileNotFound(path.clone()))
            };
            (path, result)
        })
        .collect();

    // Phase 2: sequential inserts, because they need `&mut self`.
    let mut errors = Vec::new();
    for (path, result) in hash_results {
        let file_hash = match result {
            Ok(file_hash) => file_hash,
            Err(e) => {
                errors.push(format!("{}: {}", path.display(), e));
                continue;
            }
        };

        // Mirror `add_file`: without this the bloom filter would report
        // "definitely not seen" for files that were bulk-indexed.
        if let Some(bloom) = self.bloom.as_mut() {
            bloom.insert(file_hash.as_bytes());
        }

        if let Err(e) = self.database.insert_file(&path, &file_hash.to_hex()).await {
            errors.push(format!("{}: {}", path.display(), e));
        }
    }
    Ok(errors)
}
/// Runs the phases selected by `strategy` without progress reporting.
///
/// Equivalent to [`Self::find_duplicates_with_progress`] with a
/// `progress::NullReporter`.
///
/// # Errors
///
/// Propagates the first error raised by any phase.
pub async fn find_duplicates(
    &self,
    strategy: DetectionStrategy,
) -> DedupResult<DuplicateReport> {
    let silent = progress::NullReporter;
    self.find_duplicates_with_progress(strategy, &silent).await
}
pub async fn find_duplicates_with_progress(
&self,
strategy: DetectionStrategy,
reporter: &dyn progress::ProgressReporter,
) -> DedupResult<DuplicateReport> {
use progress::{ProgressEvent, ProgressTracker};
let run_start = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_millis() as u64;
let mut report = DuplicateReport::new();
let phase_count = [
strategy.includes_hash(),
strategy.includes_perceptual(),
strategy.includes_ssim(),
strategy.includes_histogram(),
strategy.includes_feature_match(),
strategy.includes_audio(),
strategy.includes_metadata(),
]
.iter()
.filter(|&&b| b)
.count();
let mut completed_phases = 0usize;
if strategy.includes_hash() {
if reporter.is_cancelled() {
return Ok(report);
}
let mut tracker = ProgressTracker::new(reporter, "exact_hash", 0);
let hash_dups = self.find_hash_duplicates().await?;
tracker.tick_batch(1);
let groups_found = hash_dups.len();
report.add_groups(hash_dups);
tracker.complete(groups_found);
completed_phases += 1;
}
if strategy.includes_perceptual() {
if reporter.is_cancelled() {
return Ok(report);
}
let mut tracker = ProgressTracker::new(reporter, "perceptual_hash", 0);
let perceptual_dups = self.find_perceptual_duplicates().await?;
tracker.tick_batch(1);
let groups_found = perceptual_dups.len();
report.add_groups(perceptual_dups);
tracker.complete(groups_found);
completed_phases += 1;
}
if strategy.includes_ssim() {
if reporter.is_cancelled() {
return Ok(report);
}
let mut tracker = ProgressTracker::new(reporter, "ssim", 0);
let ssim_dups = self.find_ssim_duplicates().await?;
tracker.tick_batch(1);
let groups_found = ssim_dups.len();
report.add_groups(ssim_dups);
tracker.complete(groups_found);
completed_phases += 1;
}
if strategy.includes_histogram() {
if reporter.is_cancelled() {
return Ok(report);
}
let mut tracker = ProgressTracker::new(reporter, "histogram", 0);
let histogram_dups = self.find_histogram_duplicates().await?;
tracker.tick_batch(1);
let groups_found = histogram_dups.len();
report.add_groups(histogram_dups);
tracker.complete(groups_found);
completed_phases += 1;
}
if strategy.includes_feature_match() {
if reporter.is_cancelled() {
return Ok(report);
}
let mut tracker = ProgressTracker::new(reporter, "feature_match", 0);
let feature_dups = self.find_feature_duplicates().await?;
tracker.tick_batch(1);
let groups_found = feature_dups.len();
report.add_groups(feature_dups);
tracker.complete(groups_found);
completed_phases += 1;
}
if strategy.includes_audio() {
if reporter.is_cancelled() {
return Ok(report);
}
let mut tracker = ProgressTracker::new(reporter, "audio_fingerprint", 0);
let audio_dups = self.find_audio_duplicates().await?;
tracker.tick_batch(1);
let groups_found = audio_dups.len();
report.add_groups(audio_dups);
tracker.complete(groups_found);
completed_phases += 1;
}
if strategy.includes_metadata() {
if reporter.is_cancelled() {
return Ok(report);
}
let mut tracker = ProgressTracker::new(reporter, "metadata", 0);
let metadata_dups = self.find_metadata_duplicates().await?;
tracker.tick_batch(1);
let groups_found = metadata_dups.len();
report.add_groups(metadata_dups);
tracker.complete(groups_found);
completed_phases += 1;
}
let run_end = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_millis() as u64;
reporter.on_event(&ProgressEvent::RunCompleted {
total_groups: report.groups.len(),
total_elapsed_ms: run_end.saturating_sub(run_start),
});
let _ = (phase_count, completed_phases);
Ok(report)
}
/// Builds one group per stored hash that is shared by more than one file.
///
/// Each group scores `1.0` under the `"exact_hash"` method and records the
/// hash value in its score metadata.
async fn find_hash_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let by_hash = self.database.find_duplicate_hashes().await?;

    let groups = by_hash
        .into_iter()
        .filter(|(_, files)| files.len() > 1)
        .map(|(hash, files)| DuplicateGroup {
            files,
            scores: vec![SimilarityScore {
                method: "exact_hash".to_string(),
                score: 1.0,
                metadata: vec![("hash".to_string(), hash)],
            }],
        })
        .collect();

    Ok(groups)
}
/// Groups files whose stored `"phash"` fingerprints are within the
/// configured Hamming distance.
///
/// Fingerprints that do not parse as hexadecimal `u64` values are skipped.
/// When `bloom_prescreen` is set the candidate list is first narrowed by
/// `bloom_filter::prescreen_perceptual_hashes`. Grouping then uses either
/// the LSH path or pairwise comparison, depending on `use_lsh`.
async fn find_perceptual_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    // Threshold is a similarity in [0, 1]; hashes are treated as 64 bits wide.
    let max_hamming = ((1.0 - self.config.perceptual_threshold) * 64.0) as u32;

    let stored = self.database.get_all_fingerprints_by_type("phash").await?;
    let parsed: Vec<(String, visual::PerceptualHash)> = stored
        .into_iter()
        .filter_map(|(path, hex)| {
            u64::from_str_radix(&hex, 16)
                .ok()
                .map(|bits| (path, visual::PerceptualHash::new(bits, 64)))
        })
        .collect();
    if parsed.len() < 2 {
        return Ok(Vec::new());
    }

    let candidates: Vec<(String, visual::PerceptualHash)> = if self.config.bloom_prescreen {
        let raw: Vec<u64> = parsed.iter().map(|(_, ph)| ph.hash()).collect();
        let prescreen = bloom_filter::prescreen_perceptual_hashes(
            &raw,
            16,
            self.config.bloom_capacity,
            self.config.bloom_fpr,
        );
        prescreen
            .candidates
            .iter()
            .filter_map(|&idx| parsed.get(idx).cloned())
            .collect()
    } else {
        parsed
    };
    if candidates.len() < 2 {
        return Ok(Vec::new());
    }

    if !self.config.use_lsh {
        return group_by_pairwise_similarity(
            &candidates,
            max_hamming,
            |h1, h2| h1.hamming_distance(h2),
            |h1, h2| h1.similarity(h2),
            "perceptual_hash",
        );
    }
    self.find_perceptual_duplicates_lsh(&candidates, max_hamming)
}
/// Groups perceptual hashes using the LSH index instead of an all-pairs scan.
///
/// Each hash is identified by its position in `hashes`. Groups with fewer
/// than two resolvable files are dropped. Every group is scored under
/// `"perceptual_hash_lsh"` with the highest pairwise similarity among its
/// members, and carries the LSH candidate count and comparison ratio of the
/// whole pass as metadata.
fn find_perceptual_duplicates_lsh(
    &self,
    hashes: &[(String, visual::PerceptualHash)],
    max_hamming: u32,
) -> DedupResult<Vec<DuplicateGroup>> {
    let id_hashes: Vec<(u64, u64)> = hashes
        .iter()
        .zip(0u64..)
        .map(|((_, ph), id)| (id, ph.hash()))
        .collect();
    let lsh_result = lsh_index::lsh_dedup_pass(
        &id_hashes,
        max_hamming,
        self.config.lsh_num_tables,
        self.config.lsh_bits_per_table,
        self.config.lsh_seed,
    );

    let all_ids: Vec<u64> = (0..hashes.len() as u64).collect();
    let groups = lsh_index::group_by_lsh_pairs(&lsh_result.pairs, &all_ids);

    let mut result = Vec::new();
    for group_ids in &groups {
        let files: Vec<String> = group_ids
            .iter()
            .filter_map(|&id| hashes.get(id as usize))
            .map(|(path, _)| path.clone())
            .collect();
        if files.len() < 2 {
            continue;
        }

        // Highest similarity over every unordered pair of members.
        let mut best_sim = 0.0f64;
        for (pos, &first) in group_ids.iter().enumerate() {
            if let Some((_, hash_a)) = hashes.get(first as usize) {
                for &second in group_ids.iter().skip(pos + 1) {
                    if let Some((_, hash_b)) = hashes.get(second as usize) {
                        let sim = hash_a.similarity(hash_b);
                        if sim > best_sim {
                            best_sim = sim;
                        }
                    }
                }
            }
        }

        let metadata = vec![
            (
                "lsh_candidates".to_string(),
                lsh_result.candidates_checked.to_string(),
            ),
            (
                "comparison_ratio".to_string(),
                format!("{:.4}", lsh_result.comparison_ratio()),
            ),
        ];
        result.push(DuplicateGroup {
            files,
            scores: vec![SimilarityScore {
                method: "perceptual_hash_lsh".to_string(),
                score: best_sim,
                metadata,
            }],
        });
    }
    Ok(result)
}
/// Groups files whose stored `"thumbnail"` fingerprints have an SSIM of at
/// least `ssim_threshold`.
///
/// Thumbnails are hex-encoded, single-channel, square images whose side is
/// `thumbnail_resolution` (raised to at least 4). Entries with a different
/// byte count, or that `visual::Image::from_data` rejects, are skipped.
/// Grouping is greedy: each unclaimed image anchors a group and is compared
/// only against later unclaimed images.
///
/// # Errors
///
/// Propagates database errors and any hex-decoding error.
async fn find_ssim_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let min_ssim = self.config.ssim_threshold;
    let side = self.config.thumbnail_resolution.max(4);
    let pixel_count = side * side;

    let rows = self
        .database
        .get_all_fingerprints_by_type("thumbnail")
        .await?;

    let mut thumbnails: Vec<(String, visual::Image)> = Vec::new();
    for (path, hex) in rows {
        let pixels = decode_hex_bytes(&hex)?;
        if pixels.len() != pixel_count {
            continue;
        }
        if let Ok(image) = visual::Image::from_data(side, side, 1, pixels) {
            thumbnails.push((path, image));
        }
    }
    if thumbnails.len() < 2 {
        return Ok(Vec::new());
    }

    let params = visual::SsimParams::default();
    let mut claimed = vec![false; thumbnails.len()];
    let mut groups: Vec<DuplicateGroup> = Vec::new();

    for (anchor_idx, (anchor_path, anchor_image)) in thumbnails.iter().enumerate() {
        if claimed[anchor_idx] {
            continue;
        }
        let mut members = vec![anchor_path.clone()];
        let mut top_score = 0.0f64;

        for (idx, (path, image)) in thumbnails.iter().enumerate().skip(anchor_idx + 1) {
            if claimed[idx] {
                continue;
            }
            let ssim = visual::compute_ssim(anchor_image, image, &params);
            if ssim >= min_ssim {
                members.push(path.clone());
                claimed[idx] = true;
                if ssim > top_score {
                    top_score = ssim;
                }
            }
        }

        if members.len() < 2 {
            continue;
        }
        claimed[anchor_idx] = true;
        groups.push(DuplicateGroup {
            files: members,
            scores: vec![SimilarityScore {
                method: "ssim".to_string(),
                score: top_score,
                metadata: Vec::new(),
            }],
        });
    }
    Ok(groups)
}
/// Groups files whose stored `"histogram"` fingerprints compare at or above
/// `histogram_threshold`.
///
/// Each fingerprint is a JSON array of `u32` whose length is a non-zero
/// multiple of 256; it is split into consecutive 256-entry channels.
/// Entries that fail to parse or have another length are skipped. Grouping
/// is greedy: each unclaimed histogram anchors a group and is compared only
/// against later unclaimed histograms.
async fn find_histogram_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let min_corr = self.config.histogram_threshold;
    let rows = self
        .database
        .get_all_fingerprints_by_type("histogram")
        .await?;

    let histograms: Vec<(String, Vec<Vec<u32>>)> = rows
        .into_iter()
        .filter_map(|(path, json_str)| {
            let flat = serde_json::from_str::<Vec<u32>>(&json_str).ok()?;
            if flat.is_empty() || flat.len() % 256 != 0 {
                return None;
            }
            let channels = flat.chunks(256).map(|bins| bins.to_vec()).collect();
            Some((path, channels))
        })
        .collect();
    if histograms.len() < 2 {
        return Ok(Vec::new());
    }

    let mut claimed = vec![false; histograms.len()];
    let mut groups: Vec<DuplicateGroup> = Vec::new();

    for (anchor_idx, (anchor_path, anchor_hist)) in histograms.iter().enumerate() {
        if claimed[anchor_idx] {
            continue;
        }
        let mut members = vec![anchor_path.clone()];
        let mut top_score = 0.0f64;

        for (idx, (path, hist)) in histograms.iter().enumerate().skip(anchor_idx + 1) {
            if claimed[idx] {
                continue;
            }
            let corr = visual::compare_histograms(anchor_hist, hist);
            if corr >= min_corr {
                members.push(path.clone());
                claimed[idx] = true;
                if corr > top_score {
                    top_score = corr;
                }
            }
        }

        if members.len() < 2 {
            continue;
        }
        claimed[anchor_idx] = true;
        groups.push(DuplicateGroup {
            files: members,
            scores: vec![SimilarityScore {
                method: "histogram".to_string(),
                score: top_score,
                metadata: Vec::new(),
            }],
        });
    }
    Ok(groups)
}
/// Groups files whose stored `"feature_vector"` fingerprints have a cosine
/// similarity at or above the threshold.
///
/// Fingerprints are JSON arrays of `f64`; entries that fail to parse or are
/// empty are skipped. Grouping is greedy: each unclaimed vector anchors a
/// group and is compared only against later unclaimed vectors.
async fn find_feature_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    // NOTE(review): the cutoff is read from `perceptual_threshold`, not from
    // `feature_match_threshold` (a `usize`). Confirm this reuse is intended.
    let min_similarity = self.config.perceptual_threshold;
    let rows = self
        .database
        .get_all_fingerprints_by_type("feature_vector")
        .await?;

    let vectors: Vec<(String, Vec<f64>)> = rows
        .into_iter()
        .filter_map(|(path, json_str)| {
            let values = serde_json::from_str::<Vec<f64>>(&json_str).ok()?;
            if values.is_empty() {
                None
            } else {
                Some((path, values))
            }
        })
        .collect();
    if vectors.len() < 2 {
        return Ok(Vec::new());
    }

    let mut claimed = vec![false; vectors.len()];
    let mut groups: Vec<DuplicateGroup> = Vec::new();

    for (anchor_idx, (anchor_path, anchor_vec)) in vectors.iter().enumerate() {
        if claimed[anchor_idx] {
            continue;
        }
        let mut members = vec![anchor_path.clone()];
        let mut top_score = 0.0f64;

        for (idx, (path, values)) in vectors.iter().enumerate().skip(anchor_idx + 1) {
            if claimed[idx] {
                continue;
            }
            let sim = cosine_similarity(anchor_vec, values);
            if sim >= min_similarity {
                members.push(path.clone());
                claimed[idx] = true;
                if sim > top_score {
                    top_score = sim;
                }
            }
        }

        if members.len() < 2 {
            continue;
        }
        claimed[anchor_idx] = true;
        groups.push(DuplicateGroup {
            files: members,
            scores: vec![SimilarityScore {
                method: "feature_vector".to_string(),
                score: top_score,
                metadata: Vec::new(),
            }],
        });
    }
    Ok(groups)
}
/// Groups files whose stored `"audio_fingerprint"` fingerprints have a
/// similarity at or above `audio_threshold`.
///
/// Fingerprints are hex-encoded bytes; empty ones are skipped. Grouping is
/// greedy: each unclaimed fingerprint anchors a group and is compared only
/// against later unclaimed fingerprints.
///
/// # Errors
///
/// Propagates database errors and any hex-decoding error.
async fn find_audio_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
    let min_similarity = self.config.audio_threshold;
    let rows = self
        .database
        .get_all_fingerprints_by_type("audio_fingerprint")
        .await?;

    let mut fingerprints: Vec<(String, audio::AudioFingerprint)> = Vec::new();
    for (path, hex) in rows {
        let bytes = decode_hex_bytes(&hex)?;
        if bytes.is_empty() {
            continue;
        }
        // NOTE(review): 11025 and 0.0 are fixed constructor arguments here;
        // presumably sample rate and duration. Confirm against `AudioFingerprint::new`.
        fingerprints.push((path, audio::AudioFingerprint::new(bytes, 11025, 0.0)));
    }
    if fingerprints.len() < 2 {
        return Ok(Vec::new());
    }

    let mut claimed = vec![false; fingerprints.len()];
    let mut groups: Vec<DuplicateGroup> = Vec::new();

    for (anchor_idx, (anchor_path, anchor_fp)) in fingerprints.iter().enumerate() {
        if claimed[anchor_idx] {
            continue;
        }
        let mut members = vec![anchor_path.clone()];
        let mut top_score = 0.0f64;

        for (idx, (path, fp)) in fingerprints.iter().enumerate().skip(anchor_idx + 1) {
            if claimed[idx] {
                continue;
            }
            let sim = anchor_fp.similarity(fp);
            if sim >= min_similarity {
                members.push(path.clone());
                claimed[idx] = true;
                if sim > top_score {
                    top_score = sim;
                }
            }
        }

        if members.len() < 2 {
            continue;
        }
        claimed[anchor_idx] = true;
        groups.push(DuplicateGroup {
            files: members,
            scores: vec![SimilarityScore {
                method: "audio_fingerprint".to_string(),
                score: top_score,
                metadata: Vec::new(),
            }],
        });
    }
    Ok(groups)
}
/// Groups files whose stored media metadata scores at or above
/// `metadata_threshold`.
///
/// Rows from the database are turned into `MediaMetadata` values; the file
/// size is read from the filesystem at call time. Grouping is greedy: each
/// unclaimed file anchors a group and is compared only against later
/// unclaimed files. Pairs whose durations are both known and differ by more
/// than 1.0 are never grouped.
async fn find_metadata_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
use metadata::{compare_metadata, MediaMetadata};
use std::path::PathBuf;
let threshold = self.config.metadata_threshold;
let rows = self.database.get_all_files_with_metadata().await?;
if rows.len() < 2 {
return Ok(Vec::new());
}
let media_meta: Vec<MediaMetadata> = rows
.iter()
.map(
|(path, duration, width, height, video_codec, audio_codec, container)| {
// Size falls back to 0 when the file cannot be stat'ed (e.g. it was removed).
let fs_size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
let mut m = MediaMetadata::new(PathBuf::from(path), fs_size);
m.duration = *duration;
m.width = width.map(|v| v as u32);
m.height = height.map(|v| v as u32);
m.video_codec = video_codec.clone();
m.audio_codec = audio_codec.clone();
m.container = container.clone();
m
},
)
.collect();
// Parallel to `media_meta`: paths[i] is the path for media_meta[i].
let paths: Vec<String> = rows.iter().map(|(p, ..)| p.clone()).collect();
let mut groups: Vec<DuplicateGroup> = Vec::new();
let mut assigned = vec![false; media_meta.len()];
for i in 0..media_meta.len() {
if assigned[i] {
continue;
}
let mut group_files = vec![paths[i].clone()];
let mut best_score = 0.0f64;
let mut best_duration_diff: Option<f64> = None;
for j in (i + 1)..media_meta.len() {
if assigned[j] {
continue;
}
// Cheap pre-filter: a missing duration on either side does not block the pair.
let duration_ok = match (media_meta[i].duration, media_meta[j].duration) {
(Some(d1), Some(d2)) => (d1 - d2).abs() <= 1.0,
_ => true, };
if !duration_ok {
continue;
}
let sim = compare_metadata(&media_meta[i], &media_meta[j]);
let score = sim.overall_score();
if score >= threshold {
group_files.push(paths[j].clone());
assigned[j] = true;
// Remember the duration gap of the best-scoring pair only.
if score > best_score {
best_score = score;
best_duration_diff = match (media_meta[i].duration, media_meta[j].duration)
{
(Some(d1), Some(d2)) => Some((d1 - d2).abs()),
_ => None,
};
}
}
}
if group_files.len() > 1 {
assigned[i] = true;
let mut score_entry = SimilarityScore {
method: "metadata".to_string(),
score: best_score,
metadata: Vec::new(),
};
// The duration gap is attached only when both durations were known.
if let Some(diff) = best_duration_diff {
score_entry
.metadata
.push(("duration_diff_secs".to_string(), format!("{diff:.3}")));
}
groups.push(DuplicateGroup {
files: group_files,
scores: vec![score_entry],
});
}
}
Ok(groups)
}
/// Summarises the index: file count, distinct hash count, and their
/// difference (saturating at zero) as the number of duplicate files.
///
/// # Errors
///
/// Propagates errors from the database count queries.
pub async fn get_stats(&self) -> DedupResult<DedupStats> {
    let files = self.database.count_files().await?;
    let unique = self.database.count_unique_hashes().await?;
    let duplicate_files = files.saturating_sub(unique);

    Ok(DedupStats {
        total_files: files,
        total_hashes: unique,
        duplicate_files,
    })
}
/// Consumes the detector and closes its database.
///
/// # Errors
///
/// Propagates any error from closing the database.
pub async fn close(self) -> DedupResult<()> {
self.database.close().await?;
Ok(())
}
/// Asks the bloom filter whether `hash_bytes` may have been seen.
///
/// Returns `true` unconditionally when no bloom filter is configured, so a
/// `false` result is only possible with `bloom_prescreen` enabled.
#[must_use]
pub fn might_be_duplicate(&self, hash_bytes: &[u8]) -> bool {
    self.bloom
        .as_ref()
        .map_or(true, |filter| filter.contains(hash_bytes))
}
pub fn reset_bloom(&mut self) {
if let Some(ref mut bloom) = self.bloom {
bloom.clear();
}
}
}
/// Summary counts describing an index of files.
#[derive(Debug, Clone)]
pub struct DedupStats {
/// Number of files in the index.
pub total_files: usize,
/// Number of distinct file hashes in the index.
pub total_hashes: usize,
/// `total_files` minus `total_hashes`, saturating at zero.
pub duplicate_files: usize,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a database path in the system temp directory for one test.
    ///
    /// The name combines `tag`, the process id, and the current sub-second
    /// nanos so that concurrent test processes are unlikely to collide.
    #[cfg(feature = "sqlite")]
    fn unique_db_path(tag: &str) -> PathBuf {
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .subsec_nanos();
        std::env::temp_dir().join(format!("{tag}_{}_{nanos}.db", std::process::id()))
    }

    #[test]
    fn test_detection_strategy() {
        assert!(DetectionStrategy::ExactHash.includes_hash());
        assert!(!DetectionStrategy::ExactHash.includes_perceptual());
        assert!(DetectionStrategy::All.includes_hash());
        assert!(DetectionStrategy::All.includes_perceptual());
        assert!(DetectionStrategy::All.includes_audio());
        assert!(DetectionStrategy::Fast.includes_hash());
        assert!(DetectionStrategy::Fast.includes_perceptual());
        assert!(!DetectionStrategy::Fast.includes_ssim());
    }

    #[test]
    fn test_config_default() {
        let config = DedupConfig::default();
        assert_eq!(config.perceptual_threshold, 0.95);
        assert_eq!(config.ssim_threshold, 0.90);
        assert!(config.parallel);
    }

    #[test]
    fn test_config_lsh_defaults() {
        let config = DedupConfig::default();
        assert!(config.use_lsh);
        assert_eq!(config.lsh_num_tables, 8);
        assert_eq!(config.lsh_bits_per_table, 8);
        assert_eq!(config.lsh_seed, 42);
    }

    #[test]
    fn test_config_bloom_defaults() {
        let config = DedupConfig::default();
        assert!(!config.bloom_prescreen);
        assert_eq!(config.bloom_capacity, 10_000);
        assert!((config.bloom_fpr - 0.01f32).abs() < f32::EPSILON);
    }

    // The detector tests below are best-effort: if the database cannot be
    // opened in this environment, the body is skipped rather than failed.

    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_empty_slice() {
        let db_path = unique_db_path("oxidedup_test_par");
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let no_paths: &[PathBuf] = &[];
            let errors = detector
                .par_index_files(no_paths)
                .await
                .expect("par_index_files should succeed on empty input");
            assert!(errors.is_empty(), "No errors expected for empty input");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_nonexistent_paths() {
        let db_path = unique_db_path("oxidedup_test_par_ne");
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let missing = vec![
                PathBuf::from("/nonexistent/path/a.mp4"),
                PathBuf::from("/nonexistent/path/b.mp4"),
            ];
            let errors = detector
                .par_index_files(&missing)
                .await
                .expect("par_index_files should return Ok even when files are missing");
            assert_eq!(errors.len(), 2, "Should have one error per missing file");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_no_bloom_always_true() {
        let db_path = unique_db_path("oxidedup_bloom_noscreen");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: false,
            ..DedupConfig::default()
        };
        if let Ok(detector) = DuplicateDetector::new(config).await {
            assert!(
                detector.might_be_duplicate(b"some_hash_bytes"),
                "Should always return true when bloom is disabled"
            );
            assert!(
                detector.might_be_duplicate(b""),
                "Empty bytes: should return true without bloom"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_with_bloom_unknown_hash() {
        let db_path = unique_db_path("oxidedup_bloom_unknown");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        };
        if let Ok(detector) = DuplicateDetector::new(config).await {
            assert!(
                !detector.might_be_duplicate(b"never_inserted_hash"),
                "Unknown hash should return false from a fresh bloom filter"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_reset_bloom_clears_state() {
        let db_path = unique_db_path("oxidedup_bloom_reset");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            if let Some(ref mut bloom) = detector.bloom {
                bloom.insert(b"known_hash");
            }
            assert!(
                detector.might_be_duplicate(b"known_hash"),
                "After insert, bloom should report potential duplicate"
            );
            detector.reset_bloom();
            assert!(
                !detector.might_be_duplicate(b"known_hash"),
                "After reset_bloom, hash should not be found"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }
}