#![allow(dead_code)]
use std::collections::HashMap;
/// A 32-byte segment digest paired with a caller-supplied frame count.
///
/// Equality and hashing are derived, so both the digest bytes and the frame
/// count take part in `==` and in `Hash`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SegmentHash {
    /// Raw digest bytes.
    data: [u8; 32],
    /// Frame count recorded alongside the digest.
    frame_count: usize,
}

impl SegmentHash {
    /// 64-bit FNV-1a offset basis; initial value of the running state.
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    /// 64-bit FNV-1a prime; multiplier applied after every absorbed byte.
    const FNV_PRIME: u64 = 0x0100_0000_01b3;
    /// Width in bytes of one little-endian `u64` lane of the digest.
    const LANE_BYTES: usize = 8;

    /// Wraps an already computed digest together with its frame count.
    #[must_use]
    pub fn new(data: [u8; 32], frame_count: usize) -> Self {
        Self { data, frame_count }
    }

    /// Derives a 32-byte digest from `bytes` and tags it with `frame_count`.
    ///
    /// The digest is four little-endian `u64` lanes. A single FNV-1a style
    /// state is carried from lane to lane: before each lane the lane index is
    /// XORed into the state, then every input byte is absorbed again and the
    /// resulting state is written out.
    #[must_use]
    pub fn from_bytes(bytes: &[u8], frame_count: usize) -> Self {
        let mut data = [0u8; 32];
        let mut state = Self::FNV_OFFSET_BASIS;
        for (lane_idx, lane) in (0u64..).zip(data.chunks_exact_mut(Self::LANE_BYTES)) {
            // Lane 0 XORs in zero, so the first lane is the plain hash of `bytes`.
            state ^= lane_idx;
            state = Self::absorb(state, bytes);
            lane.copy_from_slice(&state.to_le_bytes());
        }
        Self { data, frame_count }
    }

    /// Reports whether `other` has the same frame count and a digest that
    /// differs from this one in at most `max_diff_bits` bit positions.
    #[must_use]
    pub fn is_match(&self, other: &Self, max_diff_bits: u32) -> bool {
        self.frame_count == other.frame_count && self.hamming_distance(other) <= max_diff_bits
    }

    /// Borrows the raw 32-byte digest.
    #[must_use]
    pub fn as_bytes(&self) -> &[u8; 32] {
        &self.data
    }

    /// Returns the frame count stored with the digest.
    #[must_use]
    pub fn frame_count(&self) -> usize {
        self.frame_count
    }

    /// Feeds every byte of `bytes` into `state`: XOR the byte in, then
    /// multiply by the FNV prime with wrapping arithmetic.
    fn absorb(state: u64, bytes: &[u8]) -> u64 {
        bytes.iter().fold(state, |acc, &byte| {
            (acc ^ u64::from(byte)).wrapping_mul(Self::FNV_PRIME)
        })
    }

    /// Counts the bit positions in which the two digests differ.
    fn hamming_distance(&self, other: &Self) -> u32 {
        let mut differing_bits = 0u32;
        for (lhs, rhs) in self.data.iter().zip(other.data.iter()) {
            differing_bits += (lhs ^ rhs).count_ones();
        }
        differing_bits
    }
}
/// Tuning parameters for segment-level deduplication.
#[derive(Debug, Clone)]
pub struct SegmentDedupConfig {
    /// Window length, in frames.
    window_size: usize,
    /// Stride, in frames.
    stride: usize,
    /// Bit-difference tolerance.
    max_diff_bits: u32,
}

impl Default for SegmentDedupConfig {
    /// Builds the stock configuration: a 30-frame window, a 15-frame stride
    /// and a tolerance of 4 differing bits.
    fn default() -> Self {
        Self::new(30, 15, 4)
    }
}

impl SegmentDedupConfig {
    /// Creates a configuration from explicit values; nothing is validated.
    #[must_use]
    pub fn new(window_size: usize, stride: usize, max_diff_bits: u32) -> Self {
        Self {
            max_diff_bits,
            stride,
            window_size,
        }
    }

    /// Returns the configured window length in frames.
    #[must_use]
    pub fn window_size_frames(&self) -> usize {
        let Self { window_size, .. } = self;
        *window_size
    }

    /// Returns the configured stride in frames.
    #[must_use]
    pub fn stride_frames(&self) -> usize {
        let Self { stride, .. } = self;
        *stride
    }

    /// Returns the configured bit-difference tolerance.
    #[must_use]
    pub fn max_diff_bits(&self) -> u32 {
        let Self { max_diff_bits, .. } = self;
        *max_diff_bits
    }
}
/// One segment registered with the deduplicator, as stored by
/// `SegmentDeduplicator::add_segment`.
#[derive(Debug, Clone)]
pub struct SegmentRecord {
/// Caller-supplied identifier of the source the segment belongs to.
pub source_id: String,
/// Caller-supplied frame offset of the segment.
/// NOTE(review): presumably the index of the segment's first frame within
/// the source; confirm against callers.
pub frame_offset: usize,
/// Digest of the segment's bytes.
pub hash: SegmentHash,
}
#[derive(Debug, Default)]
pub struct SegmentDeduplicator {
config: SegmentDedupConfig,
index: HashMap<[u8; 32], Vec<SegmentRecord>>,
unique_count: usize,
}
impl SegmentDeduplicator {
#[must_use]
pub fn new() -> Self {
Self::with_config(SegmentDedupConfig::default())
}
#[must_use]
pub fn with_config(config: SegmentDedupConfig) -> Self {
Self {
config,
index: HashMap::new(),
unique_count: 0,
}
}
pub fn add_segment(&mut self, source_id: &str, frame_offset: usize, bytes: &[u8]) {
let hash = SegmentHash::from_bytes(bytes, self.config.window_size);
let key = *hash.as_bytes();
let is_new = !self.index.contains_key(&key);
self.index.entry(key).or_default().push(SegmentRecord {
source_id: source_id.to_string(),
frame_offset,
hash,
});
if is_new {
self.unique_count += 1;
}
}
#[must_use]
pub fn find_duplicates(&self) -> Vec<Vec<&SegmentRecord>> {
self.index
.values()
.filter(|group| group.len() > 1)
.map(|group| group.iter().collect())
.collect()
}
#[must_use]
pub fn unique_count(&self) -> usize {
self.unique_count
}
#[must_use]
pub fn total_count(&self) -> usize {
self.index.values().map(Vec::len).sum()
}
#[must_use]
pub fn config(&self) -> &SegmentDedupConfig {
&self.config
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a digest whose first byte is `byte` and whose other bytes are zero.
    fn make_hash(byte: u8, frames: usize) -> SegmentHash {
        let mut digest = [0u8; 32];
        digest[0] = byte;
        SegmentHash::new(digest, frames)
    }

    /// Identical digests with identical frame counts match at zero tolerance.
    #[test]
    fn test_segment_hash_is_match_exact() {
        let lhs = make_hash(0xAB, 30);
        let rhs = make_hash(0xAB, 30);
        assert!(lhs.is_match(&rhs, 0));
    }

    /// A frame-count mismatch rejects the pair regardless of tolerance.
    #[test]
    fn test_segment_hash_no_match_different_frames() {
        let short = make_hash(0xAB, 30);
        let long = make_hash(0xAB, 60);
        assert!(!short.is_match(&long, 100));
    }

    /// Digests one bit apart match at tolerance 1 but not at tolerance 0.
    #[test]
    fn test_segment_hash_hamming_tolerance() {
        let one_bit = make_hash(0b0000_0001, 30);
        let two_bits = make_hash(0b0000_0011, 30);
        assert!(one_bit.is_match(&two_bits, 1));
        assert!(!one_bit.is_match(&two_bits, 0));
    }

    /// Hashing real input keeps the frame count and yields a non-zero digest.
    #[test]
    fn test_segment_hash_from_bytes() {
        let digest = SegmentHash::from_bytes(b"hello world", 15);
        assert_eq!(digest.frame_count(), 15);
        assert_ne!(digest.as_bytes(), &[0u8; 32]);
    }

    /// Each accessor returns the value passed to `new` in the same position.
    #[test]
    fn test_config_window_size_frames() {
        let config = SegmentDedupConfig::new(48, 24, 8);
        assert_eq!(config.window_size_frames(), 48);
        assert_eq!(config.stride_frames(), 24);
        assert_eq!(config.max_diff_bits(), 8);
    }

    /// The default window is 30 frames.
    #[test]
    fn test_config_default() {
        let config = SegmentDedupConfig::default();
        assert_eq!(config.window_size_frames(), 30);
    }

    /// Two different payloads count as two unique segments.
    #[test]
    fn test_add_segment_unique_count() {
        let mut dedup = SegmentDeduplicator::new();
        dedup.add_segment("source_a", 0, b"segment_content_one");
        dedup.add_segment("source_b", 0, b"segment_content_two");
        assert_eq!(dedup.unique_count(), 2);
    }

    /// A repeated payload raises the total but not the unique count.
    #[test]
    fn test_add_segment_duplicate_increments_total_not_unique() {
        let mut dedup = SegmentDeduplicator::new();
        for source in ["source_a", "source_b"] {
            dedup.add_segment(source, 0, b"same_content");
        }
        assert_eq!(dedup.unique_count(), 1);
        assert_eq!(dedup.total_count(), 2);
    }

    /// An empty deduplicator reports no duplicate groups.
    #[test]
    fn test_find_duplicates_empty() {
        let dedup = SegmentDeduplicator::new();
        assert!(dedup.find_duplicates().is_empty());
    }

    /// Distinct payloads never form a duplicate group.
    #[test]
    fn test_find_duplicates_no_dups() {
        let mut dedup = SegmentDeduplicator::new();
        dedup.add_segment("a", 0, b"aaa");
        dedup.add_segment("b", 0, b"bbb");
        assert!(dedup.find_duplicates().is_empty());
    }

    /// Two identical payloads plus one unrelated payload give one group of two.
    #[test]
    fn test_find_duplicates_with_dups() {
        let mut dedup = SegmentDeduplicator::new();
        dedup.add_segment("src_a", 0, b"identical_bytes");
        dedup.add_segment("src_b", 0, b"identical_bytes");
        dedup.add_segment("src_c", 30, b"different");
        let groups = dedup.find_duplicates();
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].len(), 2);
    }

    /// `with_config` keeps the configuration it was given.
    #[test]
    fn test_with_config_preserves_config() {
        let dedup = SegmentDeduplicator::with_config(SegmentDedupConfig::new(60, 30, 2));
        assert_eq!(dedup.config().window_size_frames(), 60);
    }

    /// Adding a single segment yields a total of one.
    #[test]
    fn test_segment_record_fields() {
        let mut dedup = SegmentDeduplicator::new();
        dedup.add_segment("my_video.mp4", 120, b"frame_data_xyz");
        assert_eq!(dedup.total_count(), 1);
    }

    /// Five payloads, each added from two sources, give five groups of two.
    #[test]
    fn test_multiple_sources_multiple_segments() {
        let mut dedup = SegmentDeduplicator::new();
        for fill in 0u8..5 {
            let offset = usize::from(fill) * 30;
            let payload = [fill; 64];
            for source in ["fileA", "fileB"] {
                dedup.add_segment(source, offset, &payload);
            }
        }
        assert_eq!(dedup.unique_count(), 5);
        assert_eq!(dedup.total_count(), 10);
        assert_eq!(dedup.find_duplicates().len(), 5);
    }
}