1use crate::DedupResult;
11use oxifft::Complex;
12
/// Sample encodings an audio source may use.
///
/// NOTE(review): nothing in this file reads this enum; the per-variant
/// descriptions below are inferred from the variant names and should be
/// confirmed against the decoding code that uses them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SampleFormat {
    /// Presumably signed 16-bit integer samples.
    S16,

    /// Presumably signed 32-bit integer samples.
    S32,

    /// Presumably 32-bit floating-point samples.
    F32,

    /// Presumably 64-bit floating-point samples.
    F64,
}
28
/// Interleaved PCM audio held in memory as `f32` samples.
///
/// `samples` is laid out frame by frame: every channel of frame 0, then every
/// channel of frame 1, and so on. A trailing partial frame (when
/// `samples.len()` is not a multiple of `channels`) is ignored by all methods.
#[derive(Debug, Clone)]
pub struct AudioData {
    /// Frames per second.
    pub sample_rate: u32,

    /// Number of interleaved channels.
    pub channels: usize,

    /// Interleaved sample values.
    pub samples: Vec<f32>,
}

impl AudioData {
    /// Creates an empty buffer with the given sample rate and channel count.
    #[must_use]
    pub fn new(sample_rate: u32, channels: usize) -> Self {
        Self {
            sample_rate,
            channels,
            samples: Vec::new(),
        }
    }

    /// Number of complete frames stored; `0` when `channels` is `0`.
    #[must_use]
    pub fn frame_count(&self) -> usize {
        self.samples.len().checked_div(self.channels).unwrap_or(0)
    }

    /// Length of the audio in seconds.
    ///
    /// Returns `0.0` when `sample_rate` is `0` instead of dividing by zero
    /// (which would produce NaN or infinity).
    #[must_use]
    pub fn duration(&self) -> f64 {
        if self.sample_rate == 0 {
            return 0.0;
        }
        self.frame_count() as f64 / f64::from(self.sample_rate)
    }

    /// Mixes all channels down to one by averaging each frame.
    ///
    /// Audio that is already mono is returned as a plain copy.
    #[must_use]
    pub fn to_mono(&self) -> Self {
        if self.channels == 1 {
            return self.clone();
        }

        let samples = if self.channels == 0 {
            // `chunks_exact(0)` panics, and there are no frames to mix anyway.
            Vec::new()
        } else {
            let channel_count = self.channels as f32;
            self.samples
                .chunks_exact(self.channels)
                .map(|frame| frame.iter().fold(0.0f32, |sum, &s| sum + s) / channel_count)
                .collect()
        };

        Self {
            sample_rate: self.sample_rate,
            channels: 1,
            samples,
        }
    }

    /// Reduces the sample rate to `target_rate` by picking the nearest earlier
    /// source frame for every output frame (no filtering is applied).
    ///
    /// Returns a plain copy when `target_rate` is not lower than the current
    /// rate. A `target_rate` of `0` yields empty audio with sample rate `0`.
    #[must_use]
    pub fn downsample(&self, target_rate: u32) -> Self {
        if target_rate >= self.sample_rate {
            return self.clone();
        }

        let ratio = f64::from(self.sample_rate) / f64::from(target_rate);
        let new_frame_count = (self.frame_count() as f64 / ratio) as usize;
        let mut new_samples = Vec::with_capacity(new_frame_count * self.channels);

        for frame in 0..new_frame_count {
            let src_start = (frame as f64 * ratio) as usize * self.channels;
            for ch in 0..self.channels {
                // Rounding could in principle point past the end; pad with silence.
                new_samples.push(self.samples.get(src_start + ch).copied().unwrap_or(0.0));
            }
        }

        Self {
            sample_rate: target_rate,
            channels: self.channels,
            samples: new_samples,
        }
    }

    /// Copies the section starting at `start_sec` and lasting `duration_sec`.
    ///
    /// The requested range is clamped to the available audio: a start beyond
    /// the end yields empty audio, and a range running past the end is cut
    /// short. Negative or NaN arguments are treated as `0`.
    #[must_use]
    pub fn extract(&self, start_sec: f64, duration_sec: f64) -> Self {
        let rate = f64::from(self.sample_rate);
        let total_frames = self.frame_count();

        // Float-to-integer `as` casts saturate and map NaN to 0, so only the
        // upper bound has to be enforced explicitly.
        let start_frame = ((start_sec * rate) as usize).min(total_frames);
        let requested_frames = (duration_sec * rate) as usize;
        let end_frame = start_frame
            .saturating_add(requested_frames)
            .min(total_frames);

        let samples = self.samples[start_frame * self.channels..end_frame * self.channels].to_vec();

        Self {
            sample_rate: self.sample_rate,
            channels: self.channels,
            samples,
        }
    }
}
139
/// Compact binary fingerprint of an audio clip.
#[derive(Debug, Clone, PartialEq)]
pub struct AudioFingerprint {
    /// Fingerprint bytes.
    data: Vec<u8>,

    /// Sample rate supplied by whoever built the fingerprint.
    sample_rate: u32,

    /// Duration value supplied by whoever built the fingerprint.
    duration: f64,
}

impl AudioFingerprint {
    /// Wraps already-computed fingerprint bytes together with their metadata.
    #[must_use]
    pub fn new(data: Vec<u8>, sample_rate: u32, duration: f64) -> Self {
        Self {
            data,
            sample_rate,
            duration,
        }
    }

    /// The raw fingerprint bytes.
    #[must_use]
    pub fn data(&self) -> &[u8] {
        &self.data
    }

    /// Number of differing bits between the two fingerprints.
    ///
    /// Bytes present in only one fingerprint count as eight differing bits
    /// each.
    #[must_use]
    pub fn hamming_distance(&self, other: &Self) -> usize {
        // `abs_diff` stays in `usize`; going through `i32` would truncate
        // lengths above `i32::MAX`.
        let length_penalty = self.data.len().abs_diff(other.data.len()) * 8;

        let differing_bits: usize = self
            .data
            .iter()
            .zip(&other.data)
            .map(|(a, b)| (a ^ b).count_ones() as usize)
            .sum();

        length_penalty + differing_bits
    }

    /// Fraction of matching bits, `1.0 - distance / (8 * longer length)`.
    ///
    /// Returns `0.0` when both fingerprints are empty.
    #[must_use]
    pub fn similarity(&self, other: &Self) -> f64 {
        let max_bits = self.data.len().max(other.data.len()) * 8;
        if max_bits == 0 {
            return 0.0;
        }
        let distance = self.hamming_distance(other);
        1.0 - (distance as f64 / max_bits as f64)
    }

    /// Lower-case hexadecimal rendering, two digits per byte.
    #[must_use]
    pub fn to_hex(&self) -> String {
        const DIGITS: &[u8; 16] = b"0123456789abcdef";

        // One allocation for the whole string instead of one per byte.
        let mut hex = String::with_capacity(self.data.len() * 2);
        for &byte in &self.data {
            hex.push(char::from(DIGITS[usize::from(byte >> 4)]));
            hex.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
        }
        hex
    }
}
204
205pub struct FftConfig {
207 pub size: usize,
209
210 pub hop_size: usize,
212
213 pub window: WindowFunction,
215}
216
217impl Default for FftConfig {
218 fn default() -> Self {
219 Self {
220 size: 2048,
221 hop_size: 512,
222 window: WindowFunction::Hann,
223 }
224 }
225}
226
/// Window shapes available for spectral analysis.
#[derive(Debug, Clone, Copy)]
pub enum WindowFunction {
    /// All coefficients are `1.0` (no tapering).
    Rectangular,

    /// `0.5 * (1 - cos(x))`.
    Hann,

    /// `0.54 - 0.46 * cos(x)`.
    Hamming,

    /// `0.42 - 0.5 * cos(x) + 0.08 * cos(2x)`.
    Blackman,
}

impl WindowFunction {
    /// Returns the `size` window coefficients, with `x = 2π·i / (size - 1)`
    /// for coefficient `i` in the formulas documented on each variant.
    ///
    /// `size == 0` yields an empty vector and `size == 1` yields `[1.0]` for
    /// every variant; the formulas would otherwise evaluate `0 / 0` and
    /// produce NaN for a single-point window.
    #[must_use]
    pub fn generate(&self, size: usize) -> Vec<f32> {
        // Evaluates `shape` at `size` evenly spaced phases covering [0, 2π].
        fn cosine_window(size: usize, shape: impl Fn(f32) -> f32) -> Vec<f32> {
            if size <= 1 {
                return vec![1.0; size];
            }
            let span = (size - 1) as f32;
            (0..size)
                .map(|i| shape(2.0 * std::f32::consts::PI * i as f32 / span))
                .collect()
        }

        match self {
            Self::Rectangular => vec![1.0; size],
            Self::Hann => cosine_window(size, |phase| 0.5 * (1.0 - phase.cos())),
            Self::Hamming => cosine_window(size, |phase| 0.54 - 0.46 * phase.cos()),
            Self::Blackman => cosine_window(size, |phase| {
                0.42 - 0.5 * phase.cos() + 0.08 * (2.0 * phase).cos()
            }),
        }
    }
}
270
271#[must_use]
273pub fn compute_spectrogram(audio: &AudioData, config: &FftConfig) -> Vec<Vec<f32>> {
274 let mono = audio.to_mono();
275 let window = config.window.generate(config.size);
276
277 let frame_count = (mono.samples.len().saturating_sub(config.size)) / config.hop_size + 1;
278 let mut spectrogram = Vec::with_capacity(frame_count);
279
280 for frame in 0..frame_count {
281 let start = frame * config.hop_size;
282 let end = (start + config.size).min(mono.samples.len());
283
284 let buffer: Vec<Complex<f32>> = (0..config.size)
286 .map(|i| {
287 let idx = start + i;
288 let sample = if idx < end { mono.samples[idx] } else { 0.0 };
289 Complex::new(sample * window[i], 0.0)
290 })
291 .collect();
292
293 let fft_result = oxifft::fft(&buffer);
294
295 let magnitudes: Vec<f32> = fft_result[..config.size / 2]
297 .iter()
298 .map(|c| (c.re * c.re + c.im * c.im).sqrt())
299 .collect();
300
301 spectrogram.push(magnitudes);
302 }
303
304 spectrogram
305}
306
307#[must_use]
309pub fn compute_mfcc(audio: &AudioData, n_mfcc: usize) -> Vec<Vec<f32>> {
310 let config = FftConfig::default();
311 let spectrogram = compute_spectrogram(audio, &config);
312
313 let mut mfcc = Vec::new();
315
316 for frame in spectrogram {
317 let mut coeffs = Vec::with_capacity(n_mfcc);
318
319 for k in 0..n_mfcc {
321 let mut sum = 0.0;
322 for (n, &mag) in frame.iter().enumerate() {
323 let log_mag = (mag + 1e-10).ln();
324 let cos_term =
325 (std::f32::consts::PI * k as f32 * (n as f32 + 0.5) / frame.len() as f32).cos();
326 sum += log_mag * cos_term;
327 }
328 coeffs.push(sum);
329 }
330
331 mfcc.push(coeffs);
332 }
333
334 mfcc
335}
336
337#[must_use]
339pub fn compute_fingerprint(audio: &AudioData) -> AudioFingerprint {
340 let downsampled = audio.downsample(11025);
342 let mono = downsampled.to_mono();
343
344 let config = FftConfig {
346 size: 4096,
347 hop_size: 64,
348 window: WindowFunction::Hann,
349 };
350
351 let spectrogram = compute_spectrogram(&mono, &config);
352
353 let mut fingerprint_data = Vec::new();
355
356 for frame in &spectrogram {
357 let mean: f32 = frame.iter().sum::<f32>() / frame.len() as f32;
359
360 let mut byte = 0u8;
361 for (i, &mag) in frame.iter().enumerate().take(8) {
362 if mag > mean {
363 byte |= 1u8 << i;
364 }
365 }
366 fingerprint_data.push(byte);
367 }
368
369 AudioFingerprint::new(fingerprint_data, mono.sample_rate, mono.duration())
370}
371
372#[must_use]
374pub fn compute_waveform_similarity(audio1: &AudioData, audio2: &AudioData) -> f64 {
375 let mono1 = audio1.to_mono();
376 let mono2 = audio2.to_mono();
377
378 let len = mono1.samples.len().min(mono2.samples.len());
379 if len == 0 {
380 return 0.0;
381 }
382
383 let norm1 = normalize_samples(&mono1.samples[..len]);
385 let norm2 = normalize_samples(&mono2.samples[..len]);
386
387 let mut correlation = 0.0;
389 for i in 0..len {
390 correlation += norm1[i] * norm2[i];
391 }
392
393 (correlation / len as f32).max(0.0).min(1.0) as f64
394}
395
/// Returns a copy of `samples` scaled so the largest absolute value is `1.0`.
///
/// Input whose peak is below `1e-6` is copied unchanged rather than amplified.
fn normalize_samples(samples: &[f32]) -> Vec<f32> {
    let mut peak = 0.0f32;
    for &sample in samples {
        peak = peak.max(sample.abs());
    }

    if peak < 1e-6 {
        samples.to_vec()
    } else {
        samples.iter().map(|&sample| sample / peak).collect()
    }
}
406
407#[must_use]
409pub fn compute_spectral_similarity(audio1: &AudioData, audio2: &AudioData) -> f64 {
410 let config = FftConfig::default();
411
412 let spec1 = compute_spectrogram(audio1, &config);
413 let spec2 = compute_spectrogram(audio2, &config);
414
415 if spec1.is_empty() || spec2.is_empty() {
416 return 0.0;
417 }
418
419 let min_frames = spec1.len().min(spec2.len());
420 let mut similarity_sum = 0.0;
421
422 for i in 0..min_frames {
423 let correlation = compute_spectral_correlation(&spec1[i], &spec2[i]);
424 similarity_sum += correlation;
425 }
426
427 similarity_sum / min_frames as f64
428}
429
/// Pearson correlation of two magnitude frames over their common length.
///
/// Returns a value in `[-1.0, 1.0]`, or `0.0` when there is no overlap or
/// when either frame is (almost) constant, i.e. its sum of squared deviations
/// is below `1e-6`.
///
/// All arithmetic is done in `f64`: with large magnitudes the product of the
/// two squared-deviation sums exceeds the `f32` range, which would turn the
/// denominator into infinity and the result into `0.0`.
fn compute_spectral_correlation(frame1: &[f32], frame2: &[f32]) -> f64 {
    let min_len = frame1.len().min(frame2.len());
    if min_len == 0 {
        return 0.0;
    }

    let (a, b) = (&frame1[..min_len], &frame2[..min_len]);
    let count = min_len as f64;

    let mean_a = a.iter().map(|&v| f64::from(v)).sum::<f64>() / count;
    let mean_b = b.iter().map(|&v| f64::from(v)).sum::<f64>() / count;

    let mut covariance = 0.0f64;
    let mut spread_a = 0.0f64;
    let mut spread_b = 0.0f64;

    for (&x, &y) in a.iter().zip(b) {
        let dx = f64::from(x) - mean_a;
        let dy = f64::from(y) - mean_b;

        covariance += dx * dy;
        spread_a += dx * dx;
        spread_b += dy * dy;
    }

    if spread_a < 1e-6 || spread_b < 1e-6 {
        return 0.0;
    }

    // Separate square roots keep the denominator finite even for huge inputs;
    // the clamp only removes rounding overshoot.
    (covariance / (spread_a.sqrt() * spread_b.sqrt())).clamp(-1.0, 1.0)
}
459
460#[must_use]
462pub fn find_offset(audio1: &AudioData, audio2: &AudioData, max_offset: f64) -> Option<f64> {
463 let mono1 = audio1.to_mono();
464 let mono2 = audio2.to_mono();
465
466 let max_offset_samples = (max_offset * f64::from(mono1.sample_rate)) as usize;
467 let window_size = 8192;
468
469 let mut best_correlation = 0.0;
470 let mut best_offset = 0i32;
471
472 for offset in -(max_offset_samples as i32)..=max_offset_samples as i32 {
474 let start1 = if offset >= 0 { 0 } else { (-offset) as usize };
475 let start2 = if offset >= 0 { offset as usize } else { 0 };
476
477 let len = window_size
478 .min(mono1.samples.len() - start1)
479 .min(mono2.samples.len() - start2);
480 if len == 0 {
481 continue;
482 }
483
484 let slice1 = &mono1.samples[start1..start1 + len];
485 let slice2 = &mono2.samples[start2..start2 + len];
486
487 let correlation = compute_correlation(slice1, slice2);
488
489 if correlation > best_correlation {
490 best_correlation = correlation;
491 best_offset = offset;
492 }
493 }
494
495 if best_correlation > 0.5 {
496 Some(best_offset as f64 / f64::from(mono1.sample_rate))
497 } else {
498 None
499 }
500}
501
/// Mean product of two peak-normalized signals, clamped to `[0.0, 1.0]`.
///
/// Only the overlapping prefix of the two slices is compared, so slices of
/// different lengths are accepted. Returns `0.0` when there is no overlap.
///
/// Each slice is scaled by its own peak absolute value, following the same
/// rule as `normalize_samples` (a peak below `1e-6` leaves the samples
/// unscaled). The scaling is applied on the fly so that no temporary buffers
/// are allocated; this function is called once per candidate shift by
/// `find_offset`.
fn compute_correlation(samples1: &[f32], samples2: &[f32]) -> f64 {
    let len = samples1.len().min(samples2.len());
    if len == 0 {
        return 0.0;
    }
    let (a, b) = (&samples1[..len], &samples2[..len]);

    // Divisor that peak-normalizes a slice; 1.0 leaves near-silence untouched.
    let divisor = |samples: &[f32]| -> f32 {
        let peak = samples.iter().fold(0.0f32, |peak, &s| peak.max(s.abs()));
        if peak < 1e-6 {
            1.0
        } else {
            peak
        }
    };
    let (div_a, div_b) = (divisor(a), divisor(b));

    // f64 accumulation keeps the running sum exact enough for long inputs.
    let sum: f64 = a
        .iter()
        .zip(b)
        .map(|(&x, &y)| f64::from(x / div_a) * f64::from(y / div_b))
        .sum();

    // `max` then `min` (rather than `clamp`) maps a NaN mean to 0.0.
    (sum / len as f64).max(0.0).min(1.0)
}
518
519pub fn compare_audio(audio1: &AudioData, audio2: &AudioData) -> DedupResult<AudioSimilarity> {
525 let fingerprint1 = compute_fingerprint(audio1);
526 let fingerprint2 = compute_fingerprint(audio2);
527 let fingerprint_similarity = fingerprint1.similarity(&fingerprint2);
528
529 let waveform_similarity = compute_waveform_similarity(audio1, audio2);
530 let spectral_similarity = compute_spectral_similarity(audio1, audio2);
531
532 let offset = find_offset(audio1, audio2, 5.0); Ok(AudioSimilarity {
535 fingerprint_similarity,
536 waveform_similarity,
537 spectral_similarity,
538 time_offset: offset,
539 })
540}
541
/// Result of comparing two recordings with several independent measures.
#[derive(Debug, Clone)]
pub struct AudioSimilarity {
    /// Bit-level agreement of the two fingerprints.
    pub fingerprint_similarity: f64,

    /// Sample-level agreement of the two waveforms.
    pub waveform_similarity: f64,

    /// Agreement of the two spectrograms.
    pub spectral_similarity: f64,

    /// Alignment offset in seconds, when one was found.
    pub time_offset: Option<f64>,
}

impl AudioSimilarity {
    /// Weighted combination of the three measures: 50% fingerprint,
    /// 25% waveform, 25% spectral.
    #[must_use]
    pub fn overall_score(&self) -> f64 {
        let fingerprint_part = self.fingerprint_similarity * 0.5;
        let waveform_part = self.waveform_similarity * 0.25;
        let spectral_part = self.spectral_similarity * 0.25;

        fingerprint_part + waveform_part + spectral_part
    }

    /// Whether the overall score reaches `threshold`.
    #[must_use]
    pub fn is_similar(&self, threshold: f64) -> bool {
        let score = self.overall_score();
        score >= threshold
    }
}
574
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a mono 44.1 kHz sine tone lasting `duration` seconds at
    /// `frequency` Hz.
    fn create_test_audio(duration: f64, frequency: f32) -> AudioData {
        let sample_rate = 44100u32;
        let frame_count = (duration * f64::from(sample_rate)) as usize;

        let samples = (0..frame_count)
            .map(|i| {
                let t = i as f32 / sample_rate as f32;
                (2.0 * std::f32::consts::PI * frequency * t).sin()
            })
            .collect();

        AudioData {
            sample_rate,
            channels: 1,
            samples,
        }
    }

    #[test]
    fn test_audio_creation() {
        let audio = AudioData::new(44100, 2);
        assert_eq!(audio.sample_rate, 44100);
        assert_eq!(audio.channels, 2);
    }

    #[test]
    fn test_audio_duration() {
        let one_second = create_test_audio(1.0, 440.0);
        assert!((one_second.duration() - 1.0).abs() < 0.01);
    }

    #[test]
    fn test_to_mono() {
        let stereo = AudioData {
            sample_rate: 44100,
            channels: 2,
            samples: vec![0.5, -0.5, 0.3, -0.3],
        };

        let mono = stereo.to_mono();
        assert_eq!(mono.channels, 1);
        assert_eq!(mono.samples.len(), 2);
        assert!((mono.samples[0] - 0.0).abs() < 0.01);
    }

    #[test]
    fn test_downsample() {
        let original = create_test_audio(1.0, 440.0);
        let halved = original.downsample(22050);

        assert_eq!(halved.sample_rate, 22050);
        assert!(halved.frame_count() < original.frame_count());
    }

    #[test]
    fn test_extract() {
        let extracted = create_test_audio(10.0, 440.0).extract(2.0, 3.0);

        assert!((extracted.duration() - 3.0).abs() < 0.1);
    }

    #[test]
    fn test_window_functions() {
        let hann = WindowFunction::Hann.generate(1024);
        assert_eq!(hann.len(), 1024);
        // Tapered at the edge, close to full gain in the middle.
        assert!(hann[0] < 0.1);
        assert!(hann[512] > 0.9);

        let hamming = WindowFunction::Hamming.generate(1024);
        assert_eq!(hamming.len(), 1024);

        let blackman = WindowFunction::Blackman.generate(1024);
        assert_eq!(blackman.len(), 1024);
    }

    #[test]
    fn test_spectrogram() {
        let audio = create_test_audio(1.0, 440.0);
        let config = FftConfig::default();

        let spectrogram = compute_spectrogram(&audio, &config);
        assert!(!spectrogram.is_empty());

        let expected_bins = config.size / 2;
        for frame in &spectrogram {
            assert_eq!(frame.len(), expected_bins);
        }
    }

    #[test]
    fn test_fingerprint() {
        let fingerprint = compute_fingerprint(&create_test_audio(0.5, 440.0));

        assert!(!fingerprint.data().is_empty());
        assert_eq!(fingerprint.sample_rate, 11025);
    }

    #[test]
    fn test_fingerprint_similarity() {
        let fp1 = compute_fingerprint(&create_test_audio(1.0, 440.0));
        let fp2 = compute_fingerprint(&create_test_audio(1.0, 440.0));

        // Identical input must produce (near-)identical fingerprints.
        assert!(fp1.similarity(&fp2) > 0.9);
    }

    #[test]
    fn test_waveform_similarity() {
        let audio1 = create_test_audio(1.0, 440.0);
        let audio2 = create_test_audio(1.0, 440.0);

        let similarity = compute_waveform_similarity(&audio1, &audio2);
        assert!(similarity > 0.4);
    }

    #[test]
    fn test_spectral_similarity() {
        let audio1 = create_test_audio(1.0, 440.0);
        let audio2 = create_test_audio(1.0, 440.0);

        let similarity = compute_spectral_similarity(&audio1, &audio2);
        assert!(similarity > 0.9);
    }

    #[test]
    fn test_mfcc() {
        let mfcc = compute_mfcc(&create_test_audio(1.0, 440.0), 13);

        assert!(!mfcc.is_empty());
        for coefficients in &mfcc {
            assert_eq!(coefficients.len(), 13);
        }
    }
}