Skip to main content

oximedia_dedup/
audio.rs

1//! Audio fingerprinting and similarity detection for deduplication.
2//!
3//! This module provides:
4//! - Audio fingerprint generation (Chromaprint-inspired)
5//! - Waveform similarity comparison
6//! - Spectral similarity (FFT-based)
7//! - Audio segment matching
8//! - Offset detection for shifted audio
9
10use crate::DedupResult;
11use oxifft::Complex;
12
/// Audio sample format.
///
/// A plain tag describing how one audio sample is encoded. It is `Copy`,
/// comparable and hashable, so it can be stored in sets or used as a map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SampleFormat {
    /// 16-bit signed integer
    S16,

    /// 32-bit signed integer
    S32,

    /// 32-bit floating point
    F32,

    /// 64-bit floating point
    F64,
}
28
/// Audio data representation.
///
/// Samples are stored as `f32` values. Multi-channel audio is interleaved:
/// all channels of frame 0, then all channels of frame 1, and so on.
#[derive(Debug, Clone)]
pub struct AudioData {
    /// Sample rate in Hz
    pub sample_rate: u32,

    /// Number of channels
    pub channels: usize,

    /// Audio samples (interleaved for multi-channel)
    pub samples: Vec<f32>,
}

impl AudioData {
    /// Create new, empty audio data with the given sample rate and channel count.
    #[must_use]
    pub fn new(sample_rate: u32, channels: usize) -> Self {
        Self {
            sample_rate,
            channels,
            samples: Vec::new(),
        }
    }

    /// Get number of complete frames (one sample per channel).
    ///
    /// A trailing partial frame is not counted. Returns 0 when `channels` is 0.
    #[must_use]
    pub fn frame_count(&self) -> usize {
        self.samples.len().checked_div(self.channels).unwrap_or(0)
    }

    /// Get duration in seconds.
    ///
    /// Returns 0.0 when the sample rate is 0 instead of a non-finite value.
    #[must_use]
    pub fn duration(&self) -> f64 {
        if self.sample_rate == 0 {
            return 0.0;
        }
        self.frame_count() as f64 / f64::from(self.sample_rate)
    }

    /// Convert to mono by averaging the channels of every complete frame.
    ///
    /// Audio that is already mono is returned as an unchanged clone.
    #[must_use]
    pub fn to_mono(&self) -> Self {
        if self.channels == 1 {
            return self.clone();
        }

        // `chunks_exact` panics on a chunk size of 0, and zero-channel audio
        // has no frames to average anyway.
        let samples = if self.channels == 0 {
            Vec::new()
        } else {
            let divisor = self.channels as f32;
            self.samples
                .chunks_exact(self.channels)
                .map(|frame| frame.iter().fold(0.0f32, |sum, &s| sum + s) / divisor)
                .collect()
        };

        Self {
            sample_rate: self.sample_rate,
            channels: 1,
            samples,
        }
    }

    /// Downsample to target sample rate.
    ///
    /// Each output frame is a copy of the source frame at
    /// `floor(index * sample_rate / target_rate)`; no filtering is applied
    /// before frames are dropped. If `target_rate` is not lower than the
    /// current rate the audio is returned as an unchanged clone.
    #[must_use]
    pub fn downsample(&self, target_rate: u32) -> Self {
        if target_rate >= self.sample_rate {
            return self.clone();
        }

        let ratio = f64::from(self.sample_rate) / f64::from(target_rate);
        let new_frame_count = (self.frame_count() as f64 / ratio) as usize;
        let mut new_samples = Vec::with_capacity(new_frame_count * self.channels);

        for frame in 0..new_frame_count {
            let src_start = ((frame as f64 * ratio) as usize) * self.channels;
            for ch in 0..self.channels {
                // Out-of-range reads (possible only through rounding) become silence.
                new_samples.push(self.samples.get(src_start + ch).copied().unwrap_or(0.0));
            }
        }

        Self {
            sample_rate: target_rate,
            channels: self.channels,
            samples: new_samples,
        }
    }

    /// Extract a time range.
    ///
    /// The requested range is clamped to the available audio: a start beyond
    /// the end yields empty audio, and a duration running past the end is
    /// truncated. Negative or NaN arguments are treated as 0.
    #[must_use]
    pub fn extract(&self, start_sec: f64, duration_sec: f64) -> Self {
        let total_frames = self.frame_count();
        let rate = f64::from(self.sample_rate);

        // Float-to-integer `as` casts saturate (negative/NaN -> 0, huge ->
        // usize::MAX), so clamp the start and add without overflow.
        let start_frame = ((start_sec * rate) as usize).min(total_frames);
        let requested_frames = (duration_sec * rate) as usize;
        let end_frame = start_frame
            .saturating_add(requested_frames)
            .min(total_frames);

        let start_idx = start_frame * self.channels;
        let end_idx = end_frame * self.channels;

        Self {
            sample_rate: self.sample_rate,
            channels: self.channels,
            samples: self.samples[start_idx..end_idx].to_vec(),
        }
    }
}
139
/// Audio fingerprint.
///
/// A compact byte sequence describing a piece of audio, together with the
/// sample rate and duration it was computed from. Fingerprints are compared
/// bit-wise via [`AudioFingerprint::hamming_distance`].
#[derive(Debug, Clone, PartialEq)]
pub struct AudioFingerprint {
    /// Fingerprint data as bytes
    data: Vec<u8>,

    /// Sample rate used for fingerprinting
    sample_rate: u32,

    /// Duration covered by fingerprint
    duration: f64,
}

impl AudioFingerprint {
    /// Create from data.
    #[must_use]
    pub fn new(data: Vec<u8>, sample_rate: u32, duration: f64) -> Self {
        Self {
            data,
            sample_rate,
            duration,
        }
    }

    /// Get fingerprint data.
    #[must_use]
    pub fn data(&self) -> &[u8] {
        &self.data
    }

    /// Calculate Hamming distance.
    ///
    /// Counts the differing bits over the common prefix of both fingerprints
    /// and treats every byte by which the lengths differ as 8 differing bits.
    #[must_use]
    pub fn hamming_distance(&self, other: &Self) -> usize {
        // `abs_diff` stays in `usize`, so large lengths are not truncated.
        let length_penalty = self.data.len().abs_diff(other.data.len()) * 8;

        let differing_bits: usize = self
            .data
            .iter()
            .zip(&other.data)
            .map(|(a, b)| (a ^ b).count_ones() as usize)
            .sum();

        length_penalty + differing_bits
    }

    /// Calculate similarity (0.0-1.0).
    ///
    /// This is `1 - hamming_distance / bits`, where `bits` is the bit length
    /// of the longer fingerprint. Two empty fingerprints yield 0.0.
    #[must_use]
    pub fn similarity(&self, other: &Self) -> f64 {
        let max_bits = self.data.len().max(other.data.len()) * 8;
        if max_bits == 0 {
            return 0.0;
        }
        let distance = self.hamming_distance(other);
        1.0 - (distance as f64 / max_bits as f64)
    }

    /// Convert to hex string (two lowercase hex digits per byte).
    #[must_use]
    pub fn to_hex(&self) -> String {
        const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

        // One pre-sized buffer instead of a temporary `String` per byte.
        let mut hex = String::with_capacity(self.data.len() * 2);
        for &byte in &self.data {
            hex.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
            hex.push(char::from(HEX_DIGITS[usize::from(byte & 0x0f)]));
        }
        hex
    }
}
204
205/// FFT configuration.
206pub struct FftConfig {
207    /// FFT size (must be power of 2)
208    pub size: usize,
209
210    /// Hop size
211    pub hop_size: usize,
212
213    /// Window function
214    pub window: WindowFunction,
215}
216
217impl Default for FftConfig {
218    fn default() -> Self {
219        Self {
220            size: 2048,
221            hop_size: 512,
222            window: WindowFunction::Hann,
223        }
224    }
225}
226
/// Window function for FFT.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WindowFunction {
    /// Rectangular window (no windowing)
    Rectangular,

    /// Hann window
    Hann,

    /// Hamming window
    Hamming,

    /// Blackman window
    Blackman,
}

impl WindowFunction {
    /// Generate window coefficients.
    ///
    /// Returns `size` coefficients. The cosine windows are evaluated at
    /// phase `2 * PI * i / (size - 1)`, so the first and last coefficient are
    /// equal. A window of size 0 is empty and a window of size 1 is `[1.0]`
    /// for every variant.
    #[must_use]
    pub fn generate(&self, size: usize) -> Vec<f32> {
        // With a single point the phase would be 0 / 0 (NaN); a one-sample
        // window is defined as unity instead.
        if size <= 1 {
            return vec![1.0; size];
        }

        let last = (size - 1) as f32;
        let phase = |i: usize| 2.0 * std::f32::consts::PI * i as f32 / last;

        match self {
            Self::Rectangular => vec![1.0; size],
            Self::Hann => (0..size)
                .map(|i| 0.5 * (1.0 - phase(i).cos()))
                .collect(),
            Self::Hamming => (0..size)
                .map(|i| 0.54 - 0.46 * phase(i).cos())
                .collect(),
            Self::Blackman => (0..size)
                .map(|i| {
                    let factor = phase(i);
                    0.42 - 0.5 * factor.cos() + 0.08 * (2.0 * factor).cos()
                })
                .collect(),
        }
    }
}
270
271/// Compute spectrogram of audio.
272#[must_use]
273pub fn compute_spectrogram(audio: &AudioData, config: &FftConfig) -> Vec<Vec<f32>> {
274    let mono = audio.to_mono();
275    let window = config.window.generate(config.size);
276
277    let frame_count = (mono.samples.len().saturating_sub(config.size)) / config.hop_size + 1;
278    let mut spectrogram = Vec::with_capacity(frame_count);
279
280    for frame in 0..frame_count {
281        let start = frame * config.hop_size;
282        let end = (start + config.size).min(mono.samples.len());
283
284        // Windowed FFT
285        let buffer: Vec<Complex<f32>> = (0..config.size)
286            .map(|i| {
287                let idx = start + i;
288                let sample = if idx < end { mono.samples[idx] } else { 0.0 };
289                Complex::new(sample * window[i], 0.0)
290            })
291            .collect();
292
293        let fft_result = oxifft::fft(&buffer);
294
295        // Magnitude spectrum (only first half, as second half is symmetric)
296        let magnitudes: Vec<f32> = fft_result[..config.size / 2]
297            .iter()
298            .map(|c| (c.re * c.re + c.im * c.im).sqrt())
299            .collect();
300
301        spectrogram.push(magnitudes);
302    }
303
304    spectrogram
305}
306
307/// Compute mel-frequency cepstral coefficients (MFCC).
308#[must_use]
309pub fn compute_mfcc(audio: &AudioData, n_mfcc: usize) -> Vec<Vec<f32>> {
310    let config = FftConfig::default();
311    let spectrogram = compute_spectrogram(audio, &config);
312
313    // Simplified MFCC computation (proper implementation would use mel filterbank)
314    let mut mfcc = Vec::new();
315
316    for frame in spectrogram {
317        let mut coeffs = Vec::with_capacity(n_mfcc);
318
319        // DCT of log magnitude spectrum
320        for k in 0..n_mfcc {
321            let mut sum = 0.0;
322            for (n, &mag) in frame.iter().enumerate() {
323                let log_mag = (mag + 1e-10).ln();
324                let cos_term =
325                    (std::f32::consts::PI * k as f32 * (n as f32 + 0.5) / frame.len() as f32).cos();
326                sum += log_mag * cos_term;
327            }
328            coeffs.push(sum);
329        }
330
331        mfcc.push(coeffs);
332    }
333
334    mfcc
335}
336
337/// Generate audio fingerprint.
338#[must_use]
339pub fn compute_fingerprint(audio: &AudioData) -> AudioFingerprint {
340    // Downsample to 11025 Hz for efficiency
341    let downsampled = audio.downsample(11025);
342    let mono = downsampled.to_mono();
343
344    // Compute spectrogram
345    let config = FftConfig {
346        size: 4096,
347        hop_size: 64,
348        window: WindowFunction::Hann,
349    };
350
351    let spectrogram = compute_spectrogram(&mono, &config);
352
353    // Generate fingerprint from spectrogram peaks
354    let mut fingerprint_data = Vec::new();
355
356    for frame in &spectrogram {
357        // Find spectral peaks
358        let mean: f32 = frame.iter().sum::<f32>() / frame.len() as f32;
359
360        let mut byte = 0u8;
361        for (i, &mag) in frame.iter().enumerate().take(8) {
362            if mag > mean {
363                byte |= 1u8 << i;
364            }
365        }
366        fingerprint_data.push(byte);
367    }
368
369    AudioFingerprint::new(fingerprint_data, mono.sample_rate, mono.duration())
370}
371
372/// Compute waveform similarity using cross-correlation.
373#[must_use]
374pub fn compute_waveform_similarity(audio1: &AudioData, audio2: &AudioData) -> f64 {
375    let mono1 = audio1.to_mono();
376    let mono2 = audio2.to_mono();
377
378    let len = mono1.samples.len().min(mono2.samples.len());
379    if len == 0 {
380        return 0.0;
381    }
382
383    // Normalize samples
384    let norm1 = normalize_samples(&mono1.samples[..len]);
385    let norm2 = normalize_samples(&mono2.samples[..len]);
386
387    // Compute correlation
388    let mut correlation = 0.0;
389    for i in 0..len {
390        correlation += norm1[i] * norm2[i];
391    }
392
393    (correlation / len as f32).max(0.0).min(1.0) as f64
394}
395
/// Scale samples so that the largest absolute value becomes 1.
///
/// Input whose peak is below `1e-6` (effectively silence) is returned
/// unchanged to avoid dividing by a near-zero value.
fn normalize_samples(samples: &[f32]) -> Vec<f32> {
    let mut peak = 0.0f32;
    for &sample in samples {
        peak = peak.max(sample.abs());
    }

    if peak >= 1e-6 {
        samples.iter().map(|&sample| sample / peak).collect()
    } else {
        samples.to_vec()
    }
}
406
407/// Compute spectral similarity.
408#[must_use]
409pub fn compute_spectral_similarity(audio1: &AudioData, audio2: &AudioData) -> f64 {
410    let config = FftConfig::default();
411
412    let spec1 = compute_spectrogram(audio1, &config);
413    let spec2 = compute_spectrogram(audio2, &config);
414
415    if spec1.is_empty() || spec2.is_empty() {
416        return 0.0;
417    }
418
419    let min_frames = spec1.len().min(spec2.len());
420    let mut similarity_sum = 0.0;
421
422    for i in 0..min_frames {
423        let correlation = compute_spectral_correlation(&spec1[i], &spec2[i]);
424        similarity_sum += correlation;
425    }
426
427    similarity_sum / min_frames as f64
428}
429
/// Compute correlation between two spectral frames.
///
/// Calculates the correlation coefficient of the two frames over their common
/// length: the sum of products of mean-removed values divided by the square
/// root of the product of both sums of squares. Returns 0.0 for empty input
/// or when either frame is (nearly) constant.
fn compute_spectral_correlation(frame1: &[f32], frame2: &[f32]) -> f64 {
    let common = frame1.len().min(frame2.len());
    if common == 0 {
        return 0.0;
    }

    let (a, b) = (&frame1[..common], &frame2[..common]);

    let mean_a: f32 = a.iter().sum::<f32>() / common as f32;
    let mean_b: f32 = b.iter().sum::<f32>() / common as f32;

    // Accumulate covariance and both variances in one pass.
    let (covariance, variance_a, variance_b) = a.iter().zip(b.iter()).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(cov, var_a, var_b), (&x, &y)| {
            let dx = x - mean_a;
            let dy = y - mean_b;
            (cov + dx * dy, var_a + dx * dx, var_b + dy * dy)
        },
    );

    // A flat frame has no variation to correlate against.
    if variance_a < 1e-6 || variance_b < 1e-6 {
        return 0.0;
    }

    f64::from(covariance / (variance_a * variance_b).sqrt())
}
459
460/// Find time offset between two audio clips.
461#[must_use]
462pub fn find_offset(audio1: &AudioData, audio2: &AudioData, max_offset: f64) -> Option<f64> {
463    let mono1 = audio1.to_mono();
464    let mono2 = audio2.to_mono();
465
466    let max_offset_samples = (max_offset * f64::from(mono1.sample_rate)) as usize;
467    let window_size = 8192;
468
469    let mut best_correlation = 0.0;
470    let mut best_offset = 0i32;
471
472    // Search for offset
473    for offset in -(max_offset_samples as i32)..=max_offset_samples as i32 {
474        let start1 = if offset >= 0 { 0 } else { (-offset) as usize };
475        let start2 = if offset >= 0 { offset as usize } else { 0 };
476
477        let len = window_size
478            .min(mono1.samples.len() - start1)
479            .min(mono2.samples.len() - start2);
480        if len == 0 {
481            continue;
482        }
483
484        let slice1 = &mono1.samples[start1..start1 + len];
485        let slice2 = &mono2.samples[start2..start2 + len];
486
487        let correlation = compute_correlation(slice1, slice2);
488
489        if correlation > best_correlation {
490            best_correlation = correlation;
491            best_offset = offset;
492        }
493    }
494
495    if best_correlation > 0.5 {
496        Some(best_offset as f64 / f64::from(mono1.sample_rate))
497    } else {
498        None
499    }
500}
501
/// Compute normalized cross-correlation at zero lag.
///
/// Each slice is scaled by its own peak absolute value (slices whose peak is
/// below `1e-6` are left unscaled), the scaled samples are multiplied
/// pair-wise, and the mean product is clamped to `[0.0, 1.0]`.
///
/// Only the common prefix of the two slices is compared, so slices of
/// different lengths are accepted. Returns 0.0 if either slice is empty.
fn compute_correlation(samples1: &[f32], samples2: &[f32]) -> f64 {
    if samples1.is_empty() || samples2.is_empty() {
        return 0.0;
    }

    // Divisor that brings a slice into [-1, 1]; dividing by 1.0 leaves
    // near-silent input untouched.
    let peak_scale = |samples: &[f32]| -> f32 {
        let peak = samples.iter().fold(0.0f32, |acc, &s| acc.max(s.abs()));
        if peak < 1e-6 {
            1.0
        } else {
            peak
        }
    };
    let scale1 = peak_scale(samples1);
    let scale2 = peak_scale(samples2);

    // Normalising on the fly avoids allocating two temporary vectors per
    // call; this function runs once per candidate offset in `find_offset`.
    let overlap = samples1.len().min(samples2.len());
    let sum = samples1
        .iter()
        .zip(samples2.iter())
        .fold(0.0f32, |acc, (&a, &b)| acc + (a / scale1) * (b / scale2));

    f64::from((sum / overlap as f32).max(0.0).min(1.0))
}
518
519/// Compare audio similarity.
520///
521/// # Errors
522///
523/// Returns an error if audio cannot be processed.
524pub fn compare_audio(audio1: &AudioData, audio2: &AudioData) -> DedupResult<AudioSimilarity> {
525    let fingerprint1 = compute_fingerprint(audio1);
526    let fingerprint2 = compute_fingerprint(audio2);
527    let fingerprint_similarity = fingerprint1.similarity(&fingerprint2);
528
529    let waveform_similarity = compute_waveform_similarity(audio1, audio2);
530    let spectral_similarity = compute_spectral_similarity(audio1, audio2);
531
532    let offset = find_offset(audio1, audio2, 5.0); // Max 5 seconds offset
533
534    Ok(AudioSimilarity {
535        fingerprint_similarity,
536        waveform_similarity,
537        spectral_similarity,
538        time_offset: offset,
539    })
540}
541
/// Audio similarity metrics.
///
/// Bundles the results of the individual comparison methods for one pair of
/// audio clips.
#[derive(Debug, Clone)]
pub struct AudioSimilarity {
    /// Fingerprint similarity
    pub fingerprint_similarity: f64,

    /// Waveform similarity
    pub waveform_similarity: f64,

    /// Spectral similarity
    pub spectral_similarity: f64,

    /// Time offset in seconds (if detected)
    pub time_offset: Option<f64>,
}

impl AudioSimilarity {
    /// Calculate overall similarity score.
    ///
    /// Weighted sum of the three metrics: 50% fingerprint, 25% waveform and
    /// 25% spectral. The time offset does not contribute.
    #[must_use]
    pub fn overall_score(&self) -> f64 {
        const FINGERPRINT_WEIGHT: f64 = 0.5;
        const WAVEFORM_WEIGHT: f64 = 0.25;
        const SPECTRAL_WEIGHT: f64 = 0.25;

        let fingerprint_part = self.fingerprint_similarity * FINGERPRINT_WEIGHT;
        let waveform_part = self.waveform_similarity * WAVEFORM_WEIGHT;
        let spectral_part = self.spectral_similarity * SPECTRAL_WEIGHT;

        fingerprint_part + waveform_part + spectral_part
    }

    /// Check if audio is similar above threshold.
    ///
    /// Returns `true` when the overall score is at least `threshold`.
    #[must_use]
    pub fn is_similar(&self, threshold: f64) -> bool {
        threshold <= self.overall_score()
    }
}
574
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a mono 44.1 kHz sine wave of the given length and frequency.
    fn create_test_audio(duration: f64, frequency: f32) -> AudioData {
        let sample_rate = 44100u32;
        let frame_count = (duration * f64::from(sample_rate)) as usize;

        let samples: Vec<f32> = (0..frame_count)
            .map(|i| {
                let t = i as f32 / sample_rate as f32;
                (2.0 * std::f32::consts::PI * frequency * t).sin()
            })
            .collect();

        AudioData {
            sample_rate,
            channels: 1,
            samples,
        }
    }

    #[test]
    fn test_audio_creation() {
        let created = AudioData::new(44100, 2);
        assert_eq!(created.sample_rate, 44100);
        assert_eq!(created.channels, 2);
    }

    #[test]
    fn test_audio_duration() {
        let one_second = create_test_audio(1.0, 440.0);
        assert!((one_second.duration() - 1.0).abs() < 0.01);
    }

    #[test]
    fn test_to_mono() {
        // Two stereo frames whose channels cancel out.
        let stereo = AudioData {
            sample_rate: 44100,
            channels: 2,
            samples: vec![0.5, -0.5, 0.3, -0.3],
        };

        let mono = stereo.to_mono();
        assert_eq!(mono.channels, 1);
        assert_eq!(mono.samples.len(), 2);
        assert!((mono.samples[0] - 0.0).abs() < 0.01);
    }

    #[test]
    fn test_downsample() {
        let original = create_test_audio(1.0, 440.0);
        let reduced = original.downsample(22050);

        assert_eq!(reduced.sample_rate, 22050);
        assert!(reduced.frame_count() < original.frame_count());
    }

    #[test]
    fn test_extract() {
        let clip = create_test_audio(10.0, 440.0).extract(2.0, 3.0);

        assert!((clip.duration() - 3.0).abs() < 0.1);
    }

    #[test]
    fn test_window_functions() {
        let hann = WindowFunction::Hann.generate(1024);
        assert_eq!(hann.len(), 1024);
        assert!(hann[0] < 0.1); // Start near zero
        assert!(hann[512] > 0.9); // Peak near middle

        // The remaining windows only need to produce the requested length.
        for window in &[WindowFunction::Hamming, WindowFunction::Blackman] {
            assert_eq!(window.generate(1024).len(), 1024);
        }
    }

    #[test]
    fn test_spectrogram() {
        let tone = create_test_audio(1.0, 440.0);
        let config = FftConfig::default();

        let frames = compute_spectrogram(&tone, &config);
        assert!(!frames.is_empty());

        let expected_bins = config.size / 2;
        for frame in &frames {
            assert_eq!(frame.len(), expected_bins);
        }
    }

    #[test]
    fn test_fingerprint() {
        let tone = create_test_audio(0.5, 440.0);
        let fingerprint = compute_fingerprint(&tone);

        assert!(!fingerprint.data().is_empty());
        assert_eq!(fingerprint.sample_rate, 11025);
    }

    #[test]
    fn test_fingerprint_similarity() {
        let first = compute_fingerprint(&create_test_audio(1.0, 440.0));
        let second = compute_fingerprint(&create_test_audio(1.0, 440.0));

        // Same audio should be very similar
        assert!(first.similarity(&second) > 0.9);
    }

    #[test]
    fn test_waveform_similarity() {
        let first = create_test_audio(1.0, 440.0);
        let second = create_test_audio(1.0, 440.0);

        let score = compute_waveform_similarity(&first, &second);
        // Normalized dot-product correlation of identical sine waves averages ~0.5
        assert!(score > 0.4); // Same waveform
    }

    #[test]
    fn test_spectral_similarity() {
        let first = create_test_audio(1.0, 440.0);
        let second = create_test_audio(1.0, 440.0);

        let score = compute_spectral_similarity(&first, &second);
        assert!(score > 0.9);
    }

    #[test]
    fn test_mfcc() {
        let tone = create_test_audio(1.0, 440.0);
        let coefficients = compute_mfcc(&tone, 13);

        assert!(!coefficients.is_empty());
        for frame in &coefficients {
            assert_eq!(frame.len(), 13);
        }
    }
}