#![allow(dead_code)]
use std::collections::HashMap;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SegmentFingerprint {
pub hash: u64,
pub frame_count: usize,
pub duration_ms: u64,
}
impl SegmentFingerprint {
#[must_use]
pub fn new(hash: u64, frame_count: usize, duration_ms: u64) -> Self {
Self {
hash,
frame_count,
duration_ms,
}
}
#[must_use]
pub fn from_frame_data(data: &[u8], frame_count: usize, duration_ms: u64) -> Self {
let hash = perceptual_hash_u64(data);
Self {
hash,
frame_count,
duration_ms,
}
}
#[must_use]
pub fn hamming_distance(&self, other: &Self) -> u32 {
(self.hash ^ other.hash).count_ones()
}
#[must_use]
pub fn ms_per_frame(&self) -> u64 {
if self.frame_count == 0 {
0
} else {
self.duration_ms / self.frame_count as u64
}
}
}
fn perceptual_hash_u64(data: &[u8]) -> u64 {
if data.is_empty() {
return 0;
}
const SEGS: usize = 65;
let seg_size = (data.len() + SEGS - 1) / SEGS;
let mut seg_vals = [0u64; SEGS];
for (i, chunk) in data.chunks(seg_size.max(1)).enumerate() {
if i >= SEGS {
break;
}
let fnv_offset: u64 =
0xcbf2_9ce4_8422_2325u64.wrapping_add((i as u64).wrapping_mul(0x9e37_79b9_7f4a_7c15));
let mut state: u64 = fnv_offset;
for &b in chunk {
state ^= u64::from(b);
state = state.wrapping_mul(0x0100_0000_01b3);
}
seg_vals[i] = state;
}
let mut hash = 0u64;
for i in 0..64 {
if seg_vals[i] > seg_vals[i + 1] {
hash |= 1u64 << i;
}
}
hash
}
#[must_use]
pub fn match_segments(a: &SegmentFingerprint, b: &SegmentFingerprint) -> f32 {
let differing_bits = (a.hash ^ b.hash).count_ones();
1.0 - (differing_bits as f32 / 64.0)
}
#[derive(Debug, Clone)]
pub struct TemporalWindow {
pub hashes: Vec<u64>,
pub start_idx: usize,
pub duration_ms: u64,
}
impl TemporalWindow {
#[must_use]
pub fn from_fingerprints(fps: &[SegmentFingerprint], start_idx: usize) -> Self {
let hashes = fps.iter().map(|f| f.hash).collect();
let duration_ms = fps.iter().map(|f| f.duration_ms).sum();
Self {
hashes,
start_idx,
duration_ms,
}
}
#[must_use]
pub fn similarity(&self, other: &Self) -> f32 {
if self.hashes.is_empty() || other.hashes.is_empty() {
return 0.0;
}
if self.hashes.len() != other.hashes.len() {
return 0.0;
}
let total: f32 = self
.hashes
.iter()
.zip(other.hashes.iter())
.map(|(&a, &b)| {
let diff = (a ^ b).count_ones();
1.0 - (diff as f32 / 64.0)
})
.sum();
total / self.hashes.len() as f32
}
}
#[derive(Debug, Clone)]
pub struct TemporalWindowMatcher {
pub window_size: usize,
pub stride: usize,
}
impl TemporalWindowMatcher {
#[must_use]
pub fn new(window_size: usize, stride: usize) -> Self {
Self {
window_size: window_size.max(1),
stride: stride.max(1),
}
}
#[must_use]
pub fn with_defaults() -> Self {
Self::new(4, 2)
}
#[must_use]
pub fn compare_sequences(&self, a: &[SegmentFingerprint], b: &[SegmentFingerprint]) -> f32 {
if a.len() < self.window_size || b.len() < self.window_size {
return 0.0;
}
let windows_a = self.extract_windows(a);
let windows_b = self.extract_windows(b);
let mut best = 0.0f32;
for wa in &windows_a {
for wb in &windows_b {
let sim = wa.similarity(wb);
if sim > best {
best = sim;
}
}
}
best
}
#[must_use]
pub fn find_best_alignment(
&self,
a: &[SegmentFingerprint],
b: &[SegmentFingerprint],
) -> Option<(usize, usize, f32)> {
if a.len() < self.window_size || b.len() < self.window_size {
return None;
}
let windows_a = self.extract_windows(a);
let windows_b = self.extract_windows(b);
let mut best_sim = 0.0f32;
let mut best_offset_a = 0usize;
let mut best_offset_b = 0usize;
for wa in &windows_a {
for wb in &windows_b {
let sim = wa.similarity(wb);
if sim > best_sim {
best_sim = sim;
best_offset_a = wa.start_idx;
best_offset_b = wb.start_idx;
}
}
}
if best_sim > 0.0 {
Some((best_offset_a, best_offset_b, best_sim))
} else {
None
}
}
fn extract_windows(&self, seq: &[SegmentFingerprint]) -> Vec<TemporalWindow> {
if seq.len() < self.window_size {
return Vec::new();
}
let mut windows = Vec::new();
let mut start = 0;
while start + self.window_size <= seq.len() {
let slice = &seq[start..start + self.window_size];
windows.push(TemporalWindow::from_fingerprints(slice, start));
start += self.stride;
}
windows
}
}
#[derive(Debug, Clone, PartialEq)]
pub struct SegmentMatch {
pub video_a: String,
pub segment_a_idx: usize,
pub video_b: String,
pub segment_b_idx: usize,
pub similarity: f32,
pub time_offset_ms: i64,
}
#[derive(Debug, Default)]
pub struct VideoSegmentDeduplicator {
videos: HashMap<String, Vec<SegmentFingerprint>>,
threshold: f32,
}
impl VideoSegmentDeduplicator {
#[must_use]
pub fn new() -> Self {
Self {
videos: HashMap::new(),
threshold: 0.8,
}
}
#[must_use]
pub fn with_threshold(threshold: f32) -> Self {
Self {
videos: HashMap::new(),
threshold: threshold.clamp(0.0, 1.0),
}
}
pub fn add_video(&mut self, video_id: &str, fingerprints: Vec<SegmentFingerprint>) {
self.videos.insert(video_id.to_owned(), fingerprints);
}
#[must_use]
pub fn video_count(&self) -> usize {
self.videos.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.videos.is_empty()
}
#[must_use]
pub fn find_duplicate_segments(&self) -> Vec<SegmentMatch> {
self.find_duplicate_segments_with_threshold(self.threshold)
}
#[must_use]
pub fn find_duplicate_segments_with_threshold(&self, threshold: f32) -> Vec<SegmentMatch> {
let mut matches = Vec::new();
let video_ids: Vec<&String> = self.videos.keys().collect();
let n = video_ids.len();
for i in 0..n {
for j in (i + 1)..n {
let id_a = video_ids[i];
let id_b = video_ids[j];
let fps_a = &self.videos[id_a];
let fps_b = &self.videos[id_b];
let mut video_matches =
compare_segment_sequences(id_a, fps_a, id_b, fps_b, threshold);
matches.append(&mut video_matches);
}
}
matches
}
#[must_use]
pub fn query(
&self,
video_id: &str,
fingerprints: &[SegmentFingerprint],
threshold: f32,
) -> Vec<SegmentMatch> {
let mut matches = Vec::new();
for (indexed_id, indexed_fps) in &self.videos {
if indexed_id == video_id {
continue;
}
let mut m = compare_segment_sequences(
video_id,
fingerprints,
indexed_id,
indexed_fps,
threshold,
);
matches.append(&mut m);
}
matches
}
}
fn compare_segment_sequences(
id_a: &str,
fps_a: &[SegmentFingerprint],
id_b: &str,
fps_b: &[SegmentFingerprint],
threshold: f32,
) -> Vec<SegmentMatch> {
let mut matches = Vec::new();
for (i, fp_a) in fps_a.iter().enumerate() {
for (j, fp_b) in fps_b.iter().enumerate() {
let sim = match_segments(fp_a, fp_b);
if sim >= threshold {
let time_a: i64 = fps_a[..i].iter().map(|f| f.duration_ms as i64).sum();
let time_b: i64 = fps_b[..j].iter().map(|f| f.duration_ms as i64).sum();
let time_offset_ms = time_b - time_a;
matches.push(SegmentMatch {
video_a: id_a.to_owned(),
segment_a_idx: i,
video_b: id_b.to_owned(),
segment_b_idx: j,
similarity: sim,
time_offset_ms,
});
}
}
}
matches
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_segment_fingerprint_new() {
let fp = SegmentFingerprint::new(0xDEAD_BEEF_1234_5678, 30, 1000);
assert_eq!(fp.hash, 0xDEAD_BEEF_1234_5678);
assert_eq!(fp.frame_count, 30);
assert_eq!(fp.duration_ms, 1000);
}
#[test]
fn test_from_frame_data_deterministic() {
let data = b"hello video frame data here";
let fp1 = SegmentFingerprint::from_frame_data(data, 10, 500);
let fp2 = SegmentFingerprint::from_frame_data(data, 10, 500);
assert_eq!(fp1.hash, fp2.hash);
assert_eq!(fp1.frame_count, 10);
assert_eq!(fp1.duration_ms, 500);
}
#[test]
fn test_from_frame_data_different_inputs_differ() {
let data_a: Vec<u8> = (0u8..=127).collect();
let data_b: Vec<u8> = (128u8..=255).collect();
let fp_a = SegmentFingerprint::from_frame_data(&data_a, 10, 500);
let fp_b = SegmentFingerprint::from_frame_data(&data_b, 10, 500);
assert_ne!(fp_a.hash, fp_b.hash);
}
#[test]
fn test_from_frame_data_empty() {
let fp = SegmentFingerprint::from_frame_data(b"", 0, 0);
assert_eq!(fp.hash, 0);
assert_eq!(fp.frame_count, 0);
}
#[test]
fn test_hamming_distance_identical() {
let fp = SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 30, 1000);
assert_eq!(fp.hamming_distance(&fp), 0);
}
#[test]
fn test_hamming_distance_all_differ() {
let fp_a = SegmentFingerprint::new(0x0000_0000_0000_0000, 30, 1000);
let fp_b = SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 30, 1000);
assert_eq!(fp_a.hamming_distance(&fp_b), 64);
}
#[test]
fn test_ms_per_frame() {
let fp = SegmentFingerprint::new(0, 30, 1000);
assert_eq!(fp.ms_per_frame(), 33); }
#[test]
fn test_ms_per_frame_zero_frames() {
let fp = SegmentFingerprint::new(0, 0, 1000);
assert_eq!(fp.ms_per_frame(), 0);
}
#[test]
fn test_match_segments_identical() {
let fp = SegmentFingerprint::new(0x1234_5678_ABCD_EF01, 30, 1000);
let sim = match_segments(&fp, &fp);
assert!((sim - 1.0).abs() < f32::EPSILON);
}
#[test]
fn test_match_segments_completely_different() {
let fp_a = SegmentFingerprint::new(0x0000_0000_0000_0000, 30, 1000);
let fp_b = SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 30, 1000);
let sim = match_segments(&fp_a, &fp_b);
assert!((sim - 0.0).abs() < f32::EPSILON);
}
#[test]
fn test_match_segments_half_different() {
let fp_a = SegmentFingerprint::new(0x0000_0000_FFFF_FFFF, 30, 1000);
let fp_b = SegmentFingerprint::new(0xFFFF_FFFF_0000_0000, 30, 1000);
let sim = match_segments(&fp_a, &fp_b);
assert!((sim - 0.0).abs() < f32::EPSILON);
}
#[test]
fn test_match_segments_near_identical() {
let fp_a = SegmentFingerprint::new(0b1000, 30, 1000);
let fp_b = SegmentFingerprint::new(0b1001, 30, 1000); let sim = match_segments(&fp_a, &fp_b);
let expected = 1.0 - (1.0 / 64.0);
assert!((sim - expected).abs() < 0.001);
}
#[test]
fn test_match_segments_ignores_frame_count() {
let hash = 0xCAFE_BABE_DEAD_BEEF;
let fp_a = SegmentFingerprint::new(hash, 10, 500);
let fp_b = SegmentFingerprint::new(hash, 99, 9999);
let sim = match_segments(&fp_a, &fp_b);
assert!((sim - 1.0).abs() < f32::EPSILON);
}
#[test]
fn test_temporal_window_from_fingerprints() {
let fps = vec![
SegmentFingerprint::new(0xAA, 10, 300),
SegmentFingerprint::new(0xBB, 10, 400),
SegmentFingerprint::new(0xCC, 10, 500),
];
let win = TemporalWindow::from_fingerprints(&fps, 5);
assert_eq!(win.hashes, vec![0xAA, 0xBB, 0xCC]);
assert_eq!(win.start_idx, 5);
assert_eq!(win.duration_ms, 1200);
}
#[test]
fn test_temporal_window_similarity_identical() {
let fps = vec![
SegmentFingerprint::new(0x11, 10, 100),
SegmentFingerprint::new(0x22, 10, 100),
];
let w1 = TemporalWindow::from_fingerprints(&fps, 0);
let w2 = TemporalWindow::from_fingerprints(&fps, 0);
assert!((w1.similarity(&w2) - 1.0).abs() < 0.001);
}
#[test]
fn test_temporal_window_similarity_different_lengths() {
let fps_a = vec![SegmentFingerprint::new(0x11, 10, 100)];
let fps_b = vec![
SegmentFingerprint::new(0x11, 10, 100),
SegmentFingerprint::new(0x22, 10, 100),
];
let w1 = TemporalWindow::from_fingerprints(&fps_a, 0);
let w2 = TemporalWindow::from_fingerprints(&fps_b, 0);
assert!((w1.similarity(&w2) - 0.0).abs() < f32::EPSILON);
}
#[test]
fn test_temporal_window_similarity_empty() {
let w1 = TemporalWindow {
hashes: vec![],
start_idx: 0,
duration_ms: 0,
};
let w2 = TemporalWindow {
hashes: vec![],
start_idx: 0,
duration_ms: 0,
};
assert!((w1.similarity(&w2) - 0.0).abs() < f32::EPSILON);
}
#[test]
fn test_matcher_compare_identical_sequences() {
let fps: Vec<SegmentFingerprint> = (0..8)
.map(|i| SegmentFingerprint::new(i * 0x1111_1111_1111_1111, 10, 100))
.collect();
let matcher = TemporalWindowMatcher::new(4, 2);
let sim = matcher.compare_sequences(&fps, &fps);
assert!((sim - 1.0).abs() < 0.001);
}
#[test]
fn test_matcher_compare_too_short() {
let fps = vec![SegmentFingerprint::new(0xAB, 10, 100)];
let matcher = TemporalWindowMatcher::new(4, 2);
let sim = matcher.compare_sequences(&fps, &fps);
assert!((sim - 0.0).abs() < f32::EPSILON);
}
#[test]
fn test_matcher_find_best_alignment_identical() {
let fps: Vec<SegmentFingerprint> = (0..6)
.map(|i| SegmentFingerprint::new(i as u64, 10, 100))
.collect();
let matcher = TemporalWindowMatcher::new(3, 1);
let result = matcher.find_best_alignment(&fps, &fps);
assert!(result.is_some());
let (_, _, sim) = result.expect("alignment found");
assert!((sim - 1.0).abs() < 0.001);
}
#[test]
fn test_matcher_find_best_alignment_empty_sequences() {
let fps: Vec<SegmentFingerprint> = vec![];
let matcher = TemporalWindowMatcher::new(4, 2);
assert!(matcher.find_best_alignment(&fps, &fps).is_none());
}
#[test]
fn test_deduplicator_empty() {
let dedup = VideoSegmentDeduplicator::new();
assert_eq!(dedup.video_count(), 0);
assert!(dedup.is_empty());
assert!(dedup.find_duplicate_segments().is_empty());
}
#[test]
fn test_deduplicator_single_video_no_matches() {
let mut dedup = VideoSegmentDeduplicator::new();
let fps = vec![SegmentFingerprint::new(0x1234, 10, 500)];
dedup.add_video("video_a", fps);
assert_eq!(dedup.video_count(), 1);
assert!(dedup.find_duplicate_segments().is_empty());
}
#[test]
fn test_deduplicator_identical_videos_match() {
let mut dedup = VideoSegmentDeduplicator::with_threshold(0.9);
let fps: Vec<SegmentFingerprint> = vec![
SegmentFingerprint::new(0xAAAA_AAAA_AAAA_AAAA, 10, 500),
SegmentFingerprint::new(0xBBBB_BBBB_BBBB_BBBB, 10, 500),
];
dedup.add_video("video_a", fps.clone());
dedup.add_video("video_b", fps);
let matches = dedup.find_duplicate_segments();
assert!(!matches.is_empty());
for m in &matches {
assert!(m.similarity >= 0.9);
}
}
#[test]
fn test_deduplicator_query_without_indexing() {
let mut dedup = VideoSegmentDeduplicator::new();
let fps_indexed = vec![SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500)];
dedup.add_video("indexed", fps_indexed);
let fps_query = vec![SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500)];
let results = dedup.query("query_video", &fps_query, 0.99);
assert_eq!(results.len(), 1);
assert_eq!(results[0].video_a, "query_video");
assert_eq!(results[0].video_b, "indexed");
}
#[test]
fn test_deduplicator_time_offset_computed() {
let mut dedup = VideoSegmentDeduplicator::with_threshold(0.99);
let fps_a = vec![
SegmentFingerprint::new(0x0000_0000_0000_0001, 10, 500),
SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500),
];
let fps_b = vec![SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500)];
dedup.add_video("a", fps_a);
dedup.add_video("b", fps_b);
let matches = dedup.find_duplicate_segments();
assert!(!matches.is_empty(), "expected at least one match");
let m = matches.iter().find(|m| {
(m.video_a == "a" && m.segment_a_idx == 1 && m.video_b == "b" && m.segment_b_idx == 0)
|| (m.video_a == "b" && m.segment_a_idx == 0 && m.video_b == "a" && m.segment_b_idx == 1)
});
assert!(m.is_some(), "expected match between a[1] and b[0]");
let m = m.expect("match found");
assert!(
m.time_offset_ms == -500 || m.time_offset_ms == 500,
"expected ±500ms offset, got {}",
m.time_offset_ms
);
}
}