use crate::error::{CvError, CvResult};
use oxifft::api::{Direction, Flags, Plan};
use oxifft::Complex;
use rayon::prelude::*;
/// Returns the single-precision complex number `0 + 0i`.
///
/// Serves as the fill value when an FFT buffer is pre-allocated.
fn complex_zero() -> Complex<f32> {
    let zero = 0.0_f32;
    Complex::new(zero, zero)
}
/// Tunable parameters of the fingerprinting pipeline.
#[derive(Debug, Clone)]
pub struct AudioFingerprintConfig {
    /// Sample rate of the input audio (samples per second).
    pub sample_rate: u32,
    /// Number of samples in each analysis frame (the FFT length).
    pub frame_size: usize,
    /// Number of samples between the starts of consecutive frames.
    pub hop_size: usize,
    /// Number of frequency bands the spectrum is pooled into.
    pub num_bands: usize,
    /// Lowest frequency that contributes to a band, in the unit implied by
    /// `sample_rate` (Hz when the rate is samples per second).
    pub min_freq: f32,
    /// Highest frequency that contributes to a band, same unit as `min_freq`.
    pub max_freq: f32,
}

impl Default for AudioFingerprintConfig {
    /// Builds the stock configuration: 11 025 samples/s, 4096-sample frames
    /// advanced by a quarter frame, and 12 bands spanning 300–5000.
    fn default() -> Self {
        const SAMPLE_RATE: u32 = 11_025;
        const FRAME_SIZE: usize = 4096;
        // A quarter-frame hop, i.e. consecutive frames overlap by 75 %.
        const HOP_SIZE: usize = FRAME_SIZE / 4;
        const NUM_BANDS: usize = 12;
        const MIN_FREQ: f32 = 300.0;
        const MAX_FREQ: f32 = 5000.0;

        Self {
            sample_rate: SAMPLE_RATE,
            frame_size: FRAME_SIZE,
            hop_size: HOP_SIZE,
            num_bands: NUM_BANDS,
            min_freq: MIN_FREQ,
            max_freq: MAX_FREQ,
        }
    }
}
/// A fingerprint together with metadata about the audio it describes.
#[derive(Debug, Clone)]
pub struct AudioFingerprint {
/// Sequence of 32-bit hash words making up the fingerprint.
pub fingerprint: Vec<u32>,
/// Duration of the fingerprinted audio.
/// NOTE(review): unit is presumably seconds — nothing in this file establishes it.
pub duration: f64,
/// Sample rate associated with the fingerprint.
/// NOTE(review): presumably the rate of the source audio — confirm with callers.
pub sample_rate: u32,
}
impl AudioFingerprint {
#[must_use]
pub fn new(fingerprint: Vec<u32>, duration: f64, sample_rate: u32) -> Self {
Self {
fingerprint,
duration,
sample_rate,
}
}
#[must_use]
pub fn size_bytes(&self) -> usize {
self.fingerprint.len() * std::mem::size_of::<u32>()
}
#[must_use]
pub fn compare(&self, other: &Self) -> f64 {
compare_fingerprints(&self.fingerprint, &other.fingerprint)
}
}
/// Computes a fingerprint for a buffer of audio samples.
///
/// The samples are cut into overlapping frames, each frame is reduced to
/// normalised band energies, and every pair of consecutive frames is turned
/// into one `u32` hash word. The result is empty when fewer than two complete
/// frames fit into `samples`.
///
/// Band count and frequency limits come from
/// [`AudioFingerprintConfig::default`].
///
/// # Errors
///
/// Returns an invalid-parameter error when `samples` is empty, when
/// `frame_size` is not a power of two, or when `hop_size` is zero or larger
/// than `frame_size`. Errors from the spectrogram/chromagram stages are
/// propagated.
pub fn extract_fingerprint(
    samples: &[f32],
    sample_rate: u32,
    frame_size: usize,
    hop_size: usize,
) -> CvResult<Vec<u32>> {
    if samples.is_empty() {
        return Err(CvError::invalid_parameter("samples", "empty"));
    }

    // `is_power_of_two` is false for zero, so this also rejects an empty frame.
    if !frame_size.is_power_of_two() {
        return Err(CvError::invalid_parameter(
            "frame_size",
            format!("{frame_size} (must be power of 2)"),
        ));
    }

    let hop_is_valid = (1..=frame_size).contains(&hop_size);
    if !hop_is_valid {
        return Err(CvError::invalid_parameter(
            "hop_size",
            format!("{hop_size} (must be > 0 and <= frame_size)"),
        ));
    }

    let config = AudioFingerprintConfig {
        sample_rate,
        frame_size,
        hop_size,
        ..Default::default()
    };

    let spectrogram = compute_spectrogram(samples, &config)?;
    compute_chromagram(&spectrogram, &config).map(|chroma| quantize_features(&chroma))
}
/// Computes the magnitude spectrogram of `samples`.
///
/// Frames of `config.frame_size` samples are taken every `config.hop_size`
/// samples, multiplied by a Hann window and transformed with a forward FFT.
/// Each output row holds the magnitudes of the first `frame_size / 2` bins.
/// Only frames that fit completely inside `samples` are processed.
///
/// # Errors
///
/// Fails when the FFT plan cannot be created.
fn compute_spectrogram(
    samples: &[f32],
    config: &AudioFingerprintConfig,
) -> CvResult<Vec<Vec<f32>>> {
    let plan = Plan::dft_1d(config.frame_size, Direction::Forward, Flags::ESTIMATE)
        .ok_or_else(|| CvError::detection_failed("FFT plan creation failed"))?;

    let frame_len = config.frame_size;
    let hop = config.hop_size;
    let frame_count = samples.len().saturating_sub(frame_len) / hop + 1;

    let mut rows = Vec::with_capacity(frame_count);
    let taper = create_hanning_window(frame_len);
    // Scratch buffer the FFT writes into; reused for every frame.
    let mut spectrum = vec![complex_zero(); frame_len];
    let kept_bins = frame_len / 2;

    for begin in (0..frame_count).map(|frame_idx| frame_idx * hop) {
        // Stop at the first frame that would run past the end of the input.
        let frame = match samples.get(begin..begin + frame_len) {
            Some(frame) => frame,
            None => break,
        };

        let windowed: Vec<Complex<f32>> = frame
            .iter()
            .zip(taper.iter())
            .map(|(&sample, &weight)| Complex::new(sample * weight, 0.0_f32))
            .collect();
        plan.execute(&windowed, &mut spectrum);

        let mut magnitudes: Vec<f32> = Vec::with_capacity(kept_bins);
        for bin in &spectrum[..kept_bins] {
            magnitudes.push((bin.re * bin.re + bin.im * bin.im).sqrt());
        }
        rows.push(magnitudes);
    }

    Ok(rows)
}
/// Builds a symmetric Hann window with `size` coefficients.
///
/// Coefficient `i` is `0.5 * (1 - cos(2π·i / (size - 1)))`, so the window
/// starts and ends at zero and peaks in the middle.
///
/// Degenerate sizes are handled explicitly: `size == 0` yields an empty
/// vector and `size == 1` yields `[1.0]`.
fn create_hanning_window(size: usize) -> Vec<f32> {
    // With a single point the general formula divides 0 by 0 and produces NaN,
    // which would poison every FFT frame. A one-point window is the identity.
    if size == 1 {
        return vec![1.0];
    }

    // `size == 0` never evaluates the closure; `saturating_sub` just keeps the
    // subtraction itself from underflowing.
    let last_index = size.saturating_sub(1) as f32;
    (0..size)
        .map(|i| {
            let phase = 2.0 * std::f32::consts::PI * i as f32 / last_index;
            0.5 * (1.0 - phase.cos())
        })
        .collect()
}
/// Pools every spectrogram row into `config.num_bands` band energies.
///
/// For each FFT bin the centre frequency is derived from the bin index, the
/// sample rate and the frame size. Bins below `min_freq` or above `max_freq`
/// are ignored; the rest add their magnitude to the band found through
/// [`create_frequency_bands`]. Each row is then scaled to sum to one, unless
/// its total is not positive, in which case it is left untouched.
///
/// This function currently never returns an error.
fn compute_chromagram(
    spectrogram: &[Vec<f32>],
    config: &AudioFingerprintConfig,
) -> CvResult<Vec<Vec<f32>>> {
    let band_of = create_frequency_bands(config);
    let mut rows = Vec::with_capacity(spectrogram.len());

    for magnitudes in spectrogram {
        let mut bands = vec![0.0_f32; config.num_bands];

        for (bin, &magnitude) in magnitudes.iter().enumerate() {
            let freq = bin as f32 * config.sample_rate as f32 / config.frame_size as f32;
            let outside_range = freq < config.min_freq || freq > config.max_freq;
            if !outside_range {
                // The lookup table is keyed by whole-number frequencies.
                if let Some(&band) = band_of.get(&(freq as u32)) {
                    bands[band] += magnitude;
                }
            }
        }

        let total: f32 = bands.iter().sum();
        if total > 0.0 {
            bands.iter_mut().for_each(|energy| *energy /= total);
        }
        rows.push(bands);
    }

    Ok(rows)
}
/// Builds a lookup table from whole-number frequency to band index.
///
/// The range `min_freq..=max_freq` (both truncated to integers) is divided
/// into `num_bands` bands of equal width. Every integer frequency in that
/// range maps to the band it falls into, clamped to the last band.
fn create_frequency_bands(
    config: &AudioFingerprintConfig,
) -> std::collections::HashMap<u32, usize> {
    let band_width = (config.max_freq - config.min_freq) / config.num_bands as f32;
    let lowest = config.min_freq as u32;
    let highest = config.max_freq as u32;

    (lowest..=highest)
        .map(|freq| {
            let unclamped = ((freq as f32 - config.min_freq) / band_width).floor() as usize;
            // The top edge of the range would otherwise land one past the end.
            (freq, unclamped.min(config.num_bands - 1))
        })
        .collect()
}
/// Turns a sequence of band-energy frames into one hash word per frame pair.
///
/// For every pair of consecutive frames, bit `b` of the resulting `u32` is set
/// when band `b` has strictly more energy in the later frame than in the
/// earlier one. Only the first 32 bands can contribute, since that is all a
/// `u32` can hold. Fewer than two frames produce an empty result.
///
/// Frames of unequal length are compared over their common prefix instead of
/// indexing past the end of the shorter one.
fn quantize_features(chroma: &[Vec<f32>]) -> Vec<u32> {
    chroma
        .windows(2)
        .map(|pair| {
            let (current, next) = (&pair[0], &pair[1]);
            let mut hash: u32 = 0;
            // `zip` stops at the shorter frame; `take(32)` keeps shifts in range.
            for (band, (now, later)) in current.iter().zip(next.iter()).take(32).enumerate() {
                if later - now > 0.0 {
                    hash |= 1u32 << band;
                }
            }
            hash
        })
        .collect()
}
/// Scores the similarity of two fingerprints in the range `0.0..=1.0`.
///
/// `fp1` is tried at several forward shifts against the start of `fp2` and the
/// best bit-agreement ratio is returned. The number of shifts tried is
/// `min(100, shortest_len / 2)`, but never less than one, so the unshifted
/// alignment is always evaluated, including for single-entry fingerprints.
///
/// Only `fp1` is shifted, so the result is not guaranteed to be symmetric in
/// its arguments. Returns `0.0` when either fingerprint is empty.
#[must_use]
pub fn compare_fingerprints(fp1: &[u32], fp2: &[u32]) -> f64 {
    if fp1.is_empty() || fp2.is_empty() {
        return 0.0;
    }
    best_alignment(fp1, fp2, 100).1
}

/// Finds the shift of `fp1` (at most `offset_cap` candidates) that agrees best
/// with `fp2`, returning `(shift, score)`. Ties keep the smallest shift.
fn best_alignment(fp1: &[u32], fp2: &[u32], offset_cap: usize) -> (usize, f64) {
    let shortest = fp1.len().min(fp2.len());
    // `.max(1)` guarantees offset 0 is scored even when `shortest / 2 == 0`;
    // without it, one-entry fingerprints were never compared at all.
    let candidates = offset_cap.min(shortest / 2).max(1);

    let mut best = (0_usize, 0.0_f64);
    for offset in 0..candidates {
        let score = compare_at_offset(fp1, fp2, offset);
        if score > best.1 {
            best = (offset, score);
        }
    }
    best
}

/// Fraction of matching bits between `fp1[offset..]` and the start of `fp2`.
///
/// The comparison covers the overlap of the two sequences; an empty overlap
/// (including an `offset` past the end of `fp1`) scores `0.0`.
fn compare_at_offset(fp1: &[u32], fp2: &[u32], offset: usize) -> f64 {
    let shifted = fp1.get(offset..).unwrap_or(&[]);
    let len = shifted.len().min(fp2.len());
    if len == 0 {
        return 0.0;
    }

    // Accumulate in u64 so very long fingerprints cannot overflow the counter.
    let matching_bits: u64 = shifted
        .iter()
        .zip(fp2.iter())
        .map(|(&a, &b)| u64::from((a ^ b).count_zeros()))
        .sum();

    matching_bits as f64 / (len as f64 * 32.0)
}

/// Finds the shift of `fp1` that best matches `fp2`, if it is good enough.
///
/// Tries `min(200, shortest_len / 2)` forward shifts of `fp1` (at least one,
/// so the unshifted alignment is always scored) and returns
/// `Some((offset, score))` for the best one when its score is at least
/// `threshold`. Returns `None` otherwise, or when either fingerprint is empty.
#[must_use]
pub fn find_best_offset(fp1: &[u32], fp2: &[u32], threshold: f64) -> Option<(usize, f64)> {
    if fp1.is_empty() || fp2.is_empty() {
        return None;
    }

    let (best_offset, best_score) = best_alignment(fp1, fp2, 200);
    if best_score >= threshold {
        Some((best_offset, best_score))
    } else {
        None
    }
}
pub fn extract_fingerprint_parallel(
samples: &[f32],
sample_rate: u32,
frame_size: usize,
hop_size: usize,
) -> CvResult<Vec<u32>> {
if samples.is_empty() {
return Err(CvError::invalid_parameter("samples", "empty"));
}
let config = AudioFingerprintConfig {
sample_rate,
frame_size,
hop_size,
..Default::default()
};
let num_frames = (samples.len() - frame_size) / hop_size + 1;
let chunk_size = 1000;
let fingerprints: Vec<Vec<u32>> = (0..num_frames)
.step_by(chunk_size)
.collect::<Vec<_>>()
.par_iter()
.filter_map(|&chunk_start| {
let chunk_end = (chunk_start + chunk_size).min(num_frames);
let sample_start = chunk_start * hop_size;
let sample_end = (chunk_end * hop_size + frame_size).min(samples.len());
if sample_end <= sample_start {
return None;
}
let chunk_samples = &samples[sample_start..sample_end];
extract_fingerprint(chunk_samples, sample_rate, frame_size, hop_size).ok()
})
.collect();
let mut result = Vec::new();
for fp in fingerprints {
result.extend(fp);
}
Ok(result)
}
/// Matches `query` against every fingerprint in `database`.
///
/// Each entry is tested with [`find_best_offset`], with `query` as its first
/// argument. Entries that reach `threshold` are reported as
/// `(database_index, score, offset)` in database order; the rest are omitted.
#[must_use]
pub fn search_database(
    query: &[u32],
    database: &[Vec<u32>],
    threshold: f64,
) -> Vec<(usize, f64, usize)> {
    let mut hits = Vec::new();
    for (idx, candidate) in database.iter().enumerate() {
        if let Some((offset, score)) = find_best_offset(query, candidate, threshold) {
            hits.push((idx, score, offset));
        }
    }
    hits
}
/// Counts the bits in which two fingerprints differ.
///
/// Words are compared pairwise over the common prefix. Every word by which
/// one fingerprint is longer than the other counts as 32 differing bits.
#[must_use]
pub fn hamming_distance(fp1: &[u32], fp2: &[u32]) -> usize {
    let differing_bits: usize = fp1
        .iter()
        .zip(fp2.iter())
        .map(|(a, b)| (a ^ b).count_ones() as usize)
        .sum();
    let unmatched_words = fp1.len().abs_diff(fp2.len());
    differing_bits + unmatched_words * 32
}
/// Scales `samples` so that the largest absolute value becomes `1.0`.
///
/// Empty input yields an empty vector. Input whose peak is zero is returned
/// as an unchanged copy, since there is nothing to scale by.
#[must_use]
pub fn normalize_audio(samples: &[f32]) -> Vec<f32> {
    let mut peak = 0.0_f32;
    for sample in samples {
        peak = peak.max(sample.abs());
    }

    // Covers both the empty buffer and pure silence.
    if peak == 0.0 {
        return samples.to_vec();
    }

    samples.iter().map(|sample| sample / peak).collect()
}
/// Mixes interleaved two-channel audio down to one channel.
///
/// Consecutive pairs of samples are averaged. A trailing unpaired sample is
/// dropped.
#[must_use]
pub fn stereo_to_mono(stereo: &[f32]) -> Vec<f32> {
    stereo
        .chunks_exact(2)
        .map(|frame| (frame[0] + frame[1]) / 2.0)
        .collect()
}
/// Resamples `samples` from `from_rate` to `to_rate` by picking source samples.
///
/// Output sample `i` is the input sample at index `floor(i * from_rate /
/// to_rate)`, and the output holds `floor(len * to_rate / from_rate)` samples.
/// No filtering or interpolation is applied. Positions are computed with
/// exact integer arithmetic, so they stay accurate for buffers of any length.
///
/// Equal rates return an unchanged copy. If exactly one of the rates is zero
/// the conversion is undefined and an empty vector is returned.
#[must_use]
pub fn downsample(samples: &[f32], from_rate: u32, to_rate: u32) -> Vec<f32> {
    if from_rate == to_rate {
        return samples.to_vec();
    }
    // A zero rate used to produce an infinite ratio and, for `from_rate == 0`,
    // an impossible allocation request.
    if from_rate == 0 || to_rate == 0 {
        return Vec::new();
    }

    // 128-bit intermediates: `usize * u32` cannot overflow them.
    let from = u128::from(from_rate);
    let to = u128::from(to_rate);
    let output_len = (samples.len() as u128 * to / from).min(usize::MAX as u128) as usize;

    (0..output_len)
        .map(|i| {
            // i < len * to / from  =>  i * from / to < len, so this is in bounds.
            let src_idx = (i as u128 * from / to) as usize;
            samples[src_idx]
        })
        .collect()
}
/// Applies a first-order recursive high-pass filter to `samples`.
///
/// With `rc = 1 / (2π · cutoff_freq)` and `dt = 1 / sample_rate`, the
/// smoothing factor is `alpha = rc / (rc + dt)` and each output is
/// `alpha * (previous_output + input - previous_input)`. Both "previous"
/// values start at zero. The output has the same length as the input.
#[must_use]
pub fn high_pass_filter(samples: &[f32], cutoff_freq: f32, sample_rate: u32) -> Vec<f32> {
    let time_constant = 1.0 / (2.0 * std::f32::consts::PI * cutoff_freq);
    let sample_period = 1.0 / sample_rate as f32;
    let alpha = time_constant / (time_constant + sample_period);

    // Filter memory: (previous input, previous output).
    let mut memory = (0.0_f32, 0.0_f32);
    samples
        .iter()
        .map(|&input| {
            let (last_input, last_output) = memory;
            let filtered = alpha * (last_output + input - last_input);
            memory = (input, filtered);
            filtered
        })
        .collect()
}
// Unit tests for the fingerprinting pipeline and the audio helper functions.
#[cfg(test)]
mod tests {
use super::*;
// Generates a unit-amplitude sine wave of frequency `freq`, `duration_secs`
// long, sampled at `sample_rate`.
fn create_test_audio(duration_secs: f32, sample_rate: u32, freq: f32) -> Vec<f32> {
let num_samples = (duration_secs * sample_rate as f32) as usize;
(0..num_samples)
.map(|i| {
let t = i as f32 / sample_rate as f32;
(2.0 * std::f32::consts::PI * freq * t).sin()
})
.collect()
}
// One second of a 440 sine at the default parameters must yield a non-empty fingerprint.
#[test]
fn test_extract_fingerprint() {
let samples = create_test_audio(1.0, 11025, 440.0);
let fp = extract_fingerprint(&samples, 11025, 4096, 1024)
.expect("extract_fingerprint should succeed");
assert!(!fp.is_empty());
}
// Pins the default sample rate, frame size and hop size.
#[test]
fn test_config_default() {
let config = AudioFingerprintConfig::default();
assert_eq!(config.sample_rate, 11025);
assert_eq!(config.frame_size, 4096);
assert_eq!(config.hop_size, 1024);
}
// The window has the requested length, is near zero at the edge and near one in the middle.
#[test]
fn test_hanning_window() {
let window = create_hanning_window(512);
assert_eq!(window.len(), 512);
assert!(window[0] < 0.1);
assert!(window[256] > 0.9);
}
// Identical fingerprints must score close to 1.
#[test]
fn test_compare_fingerprints() {
let fp1 = vec![0x12345678, 0x9ABCDEF0];
let fp2 = vec![0x12345678, 0x9ABCDEF0];
let score = compare_fingerprints(&fp1, &fp2);
assert!(score > 0.95);
}
// Bitwise-opposite fingerprints must score close to 0.
#[test]
fn test_compare_different_fingerprints() {
let fp1 = vec![0xFFFFFFFF; 10];
let fp2 = vec![0x00000000; 10];
let score = compare_fingerprints(&fp1, &fp2);
assert!(score < 0.1);
}
// Only checks that some offset reaches the 0.5 threshold; the offset itself is not asserted.
#[test]
fn test_find_best_offset() {
let fp1 = vec![1, 2, 3, 4, 5, 6, 7, 8];
let fp2 = vec![0, 0, 1, 2, 3, 4, 5, 6];
let result = find_best_offset(&fp1, &fp2, 0.5);
assert!(result.is_some());
}
// Equal inputs give distance 0; three all-zero words against three all-one words differ in 3 * 32 bits.
#[test]
fn test_hamming_distance() {
let fp1 = vec![0, 0, 0];
let fp2 = vec![0, 0, 0];
assert_eq!(hamming_distance(&fp1, &fp2), 0);
let fp3 = vec![0xFFFFFFFF; 3];
let distance = hamming_distance(&fp1, &fp3);
assert_eq!(distance, 96); }
// Normalised output keeps its length and stays within [-1, 1].
#[test]
fn test_normalize_audio() {
let samples = vec![-2.0, -1.0, 0.0, 1.0, 2.0];
let normalized = normalize_audio(&samples);
assert_eq!(normalized.len(), 5);
assert!(normalized.iter().all(|&x| x >= -1.0 && x <= 1.0));
}
// Each interleaved pair is averaged into one mono sample.
#[test]
fn test_stereo_to_mono() {
let stereo = vec![1.0, 0.0, 0.5, 0.5, -1.0, 1.0];
let mono = stereo_to_mono(&stereo);
assert_eq!(mono.len(), 3);
assert_eq!(mono[0], 0.5);
assert_eq!(mono[1], 0.5);
assert_eq!(mono[2], 0.0);
}
// Halving the rate of 8 samples should give roughly 4 samples (3 to 5 accepted).
#[test]
fn test_downsample() {
let samples = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let downsampled = downsample(&samples, 8000, 4000);
assert!(downsampled.len() >= 3 && downsampled.len() <= 5);
}
// The filter must preserve the number of samples; output values are not checked.
#[test]
fn test_high_pass_filter() {
let samples = create_test_audio(0.1, 11025, 440.0);
let filtered = high_pass_filter(&samples, 100.0, 11025);
assert_eq!(filtered.len(), samples.len());
}
// An empty sample buffer is rejected.
#[test]
fn test_empty_samples() {
let samples: Vec<f32> = Vec::new();
assert!(extract_fingerprint(&samples, 11025, 4096, 1024).is_err());
}
// A frame size that is not a power of two (100) is rejected.
#[test]
fn test_invalid_frame_size() {
let samples = create_test_audio(1.0, 11025, 440.0);
assert!(extract_fingerprint(&samples, 11025, 100, 50).is_err());
}
// The database contains an exact copy of the query, so at least one hit is expected at threshold 0.8.
#[test]
fn test_search_database() {
let query = vec![1, 2, 3, 4, 5];
let database = vec![
vec![1, 2, 3, 4, 5],
vec![6, 7, 8, 9, 10],
vec![1, 2, 3, 4, 6],
];
let results = search_database(&query, &database, 0.8);
assert!(!results.is_empty());
}
// Three frames produce two hash words, one per consecutive frame pair.
#[test]
fn test_quantize_features() {
let chroma = vec![
vec![0.1, 0.2, 0.3],
vec![0.2, 0.3, 0.4],
vec![0.3, 0.4, 0.5],
];
let fp = quantize_features(&chroma);
assert_eq!(fp.len(), 2); }
// The constructor stores its arguments unchanged; three u32 words occupy 12 bytes.
#[test]
fn test_audio_fingerprint() {
let fp = AudioFingerprint::new(vec![1, 2, 3], 10.0, 11025);
assert_eq!(fp.fingerprint.len(), 3);
assert_eq!(fp.duration, 10.0);
assert_eq!(fp.sample_rate, 11025);
assert_eq!(fp.size_bytes(), 12);
}
}