songrec/fingerprinting/
algorithm.rs

1use chfft::RFft1D;
2use std::error::Error;
3use std::io::BufReader;
4use std::collections::HashMap;
5
6use crate::fingerprinting::hanning::HANNING_WINDOW_2048_MULTIPLIERS;
7use crate::fingerprinting::signature_format::{DecodedSignature, FrequencyBand, FrequencyPeak};
8
9
10pub struct SignatureGenerator {
11
12    // Used when processing input:
13
14    ring_buffer_of_samples: Vec<i16>,
15    /// Ring buffer.
16    ring_buffer_of_samples_index: usize,
17
18    reordered_ring_buffer_of_samples: Vec<f32>,
19    /// Reordered, temporary version of the ring buffer above, with floats for precision because we applied Hanning window.
20
21    fft_outputs: Vec<Vec<f32>>,
22    /// Ring buffer. Lists of 1025 floats, premultiplied with a Hanning function before being passed through FFT, computed from the ring buffer every new 128 samples
23    fft_outputs_index: usize,
24
25    fft_object: RFft1D<f32>,
26
27    spread_fft_outputs: Vec<Vec<f32>>,
28    /// Ring buffer.
29    spread_fft_outputs_index: usize,
30
31    num_spread_ffts_done: u32,
32
33    signature: DecodedSignature,
34}
35
36impl SignatureGenerator {
37    pub fn make_signature_from_file(file_path: &str) -> Result<DecodedSignature, Box<dyn Error>> {
38        // Check if file exists
39        if !std::path::Path::new(file_path).exists() {
40            return Err(format!("File not found: {}", file_path).into());
41        }
42
43        // Decode the .WAV, .MP3, .OGG or .FLAC file
44        let file = std::fs::File::open(file_path)
45            .map_err(|e| format!("Failed to open file '{}': {}", file_path, e))?;
46        
47        let decoder = rodio::Decoder::new(BufReader::new(file))
48            .map_err(|e| format!("Failed to decode audio file '{}': {}. Note: M4A/AAC format may not be fully supported on all platforms.", file_path, e))?;
49        
50        // Downsample the raw PCM samples to 16 KHz, and skip to the middle of the file
51        // in order to increase recognition odds. Take 12 seconds of sample.
52
53        let converted_file = rodio::source::UniformSourceIterator::new(decoder, 1, 16000);
54
55        let raw_pcm_samples: Vec<i16> = converted_file.collect();
56        
57        // Check if we got any samples
58        if raw_pcm_samples.is_empty() {
59            return Err(format!("No audio samples could be extracted from file '{}'. The file may be corrupted or in an unsupported format.", file_path).into());
60        }
61
62        let mut raw_pcm_samples_slice: &[i16] = &raw_pcm_samples;
63
64        let slice_len = raw_pcm_samples_slice.len().min(12 * 16000);
65        
66        // Check if we have enough samples for fingerprinting (at least 3 seconds)
67        if slice_len < 3 * 16000 {
68            return Err(format!("Audio file '{}' is too short for fingerprinting. Need at least 3 seconds of audio, but only got {:.2} seconds.", 
69                file_path, slice_len as f32 / 16000.0).into());
70        }
71
72        if raw_pcm_samples_slice.len() > 12 * 16000 {
73            let middle = raw_pcm_samples.len() / 2;
74
75            raw_pcm_samples_slice = &raw_pcm_samples_slice[middle - (6 * 16000)..middle + (6 * 16000)];
76        }
77
78        Ok(SignatureGenerator::make_signature_from_buffer(&raw_pcm_samples_slice[..slice_len]))
79    }
80
81    pub fn make_signature_from_buffer(s16_mono_16khz_buffer: &[i16]) -> DecodedSignature {
82        let mut this = SignatureGenerator {
83            ring_buffer_of_samples: vec![0i16; 2048],
84            ring_buffer_of_samples_index: 0,
85
86            reordered_ring_buffer_of_samples: vec![0.0f32; 2048],
87
88            fft_outputs: vec![vec![0.0f32; 1025]; 256],
89            fft_outputs_index: 0,
90
91            fft_object: RFft1D::<f32>::new(2048),
92
93            spread_fft_outputs: vec![vec![0.0f32; 1025]; 256],
94            spread_fft_outputs_index: 0,
95
96            num_spread_ffts_done: 0,
97
98            signature: DecodedSignature {
99                sample_rate_hz: 16000,
100                number_samples: s16_mono_16khz_buffer.len() as u32,
101                frequency_band_to_sound_peaks: HashMap::new(),
102            },
103        };        for chunk in s16_mono_16khz_buffer.chunks_exact(128) {
104            this.do_fft_internal(chunk);
105
106            this.do_peak_spreading();
107
108            this.num_spread_ffts_done += 1;
109
110            if this.num_spread_ffts_done >= 46 {
111                this.do_peak_recognition();
112            }
113        }
114
115        this.signature
116    }
117
118    /// Create a new SignatureGenerator instance for streaming recognition
119    pub fn new() -> Self {
120        Self {
121            ring_buffer_of_samples: vec![0i16; 2048],
122            ring_buffer_of_samples_index: 0,
123            reordered_ring_buffer_of_samples: vec![0.0f32; 2048],
124            fft_outputs: vec![vec![0.0f32; 1025]; 256],
125            fft_outputs_index: 0,
126            fft_object: RFft1D::<f32>::new(2048),
127            spread_fft_outputs: vec![vec![0.0f32; 1025]; 256],
128            spread_fft_outputs_index: 0,
129            num_spread_ffts_done: 0,
130            signature: DecodedSignature {
131                sample_rate_hz: 16000,
132                number_samples: 0,
133                frequency_band_to_sound_peaks: HashMap::new(),
134            },
135        }
136    }
137
138    /// Process audio samples and update the signature
139    /// This is a public version of do_fft that also updates sample count
140    pub fn do_fft(&mut self, s16_mono_16khz_buffer: &[i16], sample_rate: u32) {
141        // Update sample count
142        self.signature.number_samples += s16_mono_16khz_buffer.len() as u32;
143        self.signature.sample_rate_hz = sample_rate;
144
145        // Call the internal FFT processing
146        self.do_fft_internal(s16_mono_16khz_buffer);
147        
148        self.do_peak_spreading();
149        self.num_spread_ffts_done += 1;
150
151        if self.num_spread_ffts_done >= 46 {
152            self.do_peak_recognition();
153        }
154    }
155
156    /// Get the current signature
157    pub fn get_signature(&self) -> DecodedSignature {
158        self.signature.clone()
159    }
160
161    fn do_fft_internal(&mut self, s16_mono_16khz_buffer: &[i16]) {
162
163        // Copy the 128 input s16le samples to the local ring buffer
164
165        self.ring_buffer_of_samples[self.ring_buffer_of_samples_index..self.ring_buffer_of_samples_index + 128].copy_from_slice(s16_mono_16khz_buffer);
166
167        self.ring_buffer_of_samples_index += 128;
168        self.ring_buffer_of_samples_index &= 2047;
169
170        // Reorder the items (put the latest data at end) and apply Hanning window
171
172        for (index, multiplier) in HANNING_WINDOW_2048_MULTIPLIERS.iter().enumerate() {
173            self.reordered_ring_buffer_of_samples[index] =
174                self.ring_buffer_of_samples[(index + self.ring_buffer_of_samples_index) & 2047] as f32 *
175                    multiplier;
176        }
177
178        // Perform Fast Fourier transform
179
180        let complex_fft_results = self.fft_object.forward(&self.reordered_ring_buffer_of_samples);
181
182        assert_eq!(complex_fft_results.len(), 1025);
183
184        // Turn complex into reals, and put the results into a local array
185
186        let real_fft_results = &mut self.fft_outputs[self.fft_outputs_index];
187
188        for index in 0..=1024 {
189            real_fft_results[index] = (
190                (
191                    complex_fft_results[index].re.powi(2) +
192                        complex_fft_results[index].im.powi(2)
193                ) / ((1 << 17) as f32)
194            ).max(0.0000000001);
195        }
196
197        self.fft_outputs_index += 1;
198        self.fft_outputs_index &= 255;
199    }
200
201    fn do_peak_spreading(&mut self) {
202        let real_fft_results = &self.fft_outputs[((self.fft_outputs_index as i32 - 1) & 255) as usize];
203
204        let spread_fft_results = &mut self.spread_fft_outputs[self.spread_fft_outputs_index];
205
206        // Perform frequency-domain spreading of peak values
207
208        spread_fft_results.copy_from_slice(real_fft_results);
209
210        for position in 0..=1022 {
211            spread_fft_results[position] = spread_fft_results[position]
212                .max(spread_fft_results[position + 1])
213                .max(spread_fft_results[position + 2]);
214        }
215
216        // Perform time-domain spreading of peak values
217
218        let spread_fft_results_copy = spread_fft_results.clone(); // Avoid mutable+mutable borrow of self.spread_fft_outputs
219
220        for position in 0..=1024 {
221            for former_fft_number in &[1, 3, 6] {
222                let former_fft_output = &mut self.spread_fft_outputs[((self.spread_fft_outputs_index as i32 - *former_fft_number) & 255) as usize];
223
224                former_fft_output[position] = former_fft_output[position]
225                    .max(spread_fft_results_copy[position]);
226            }
227        }
228
229        self.spread_fft_outputs_index += 1;
230        self.spread_fft_outputs_index &= 255;
231    }
232
233    fn do_peak_recognition(&mut self) {
234
235        // Note: when substracting an array index, casting to signed is needed
236        // to avoid underflow panics at runtime.
237
238        let fft_minus_46 = &self.fft_outputs[((self.fft_outputs_index as i32 - 46) & 255) as usize];
239        let fft_minus_49 = &self.spread_fft_outputs[((self.spread_fft_outputs_index as i32 - 49) & 255) as usize];
240
241        for bin_position in 10..=1014 {
242
243            // Ensure that the bin is large enough to be a peak
244
245            if fft_minus_46[bin_position] >= 1.0 / 64.0 &&
246                fft_minus_46[bin_position] >= fft_minus_49[bin_position - 1] {
247
248                // Ensure that it is frequency-domain local minimum
249
250                let mut max_neighbor_in_fft_minus_49: f32 = 0.0;
251
252                for neighbor_offset in &[-10, -7, -4, -3, 1, 2, 5, 8] {
253                    max_neighbor_in_fft_minus_49 = max_neighbor_in_fft_minus_49
254                        .max(fft_minus_49[(bin_position as i32 + *neighbor_offset) as usize]);
255                }
256
257                if fft_minus_46[bin_position] > max_neighbor_in_fft_minus_49 {
258
259                    // Ensure that it is a time-domain local minimum
260
261                    let mut max_neighbor_in_other_adjacent_ffts = max_neighbor_in_fft_minus_49;
262
263                    for other_offset in &[-53, -45,
264                        165, 172, 179, 186, 193, 200,
265                        214, 221, 228, 235, 242, 249] {
266                        let other_fft = &self.spread_fft_outputs[((self.spread_fft_outputs_index as i32 + other_offset) & 255) as usize];
267
268                        max_neighbor_in_other_adjacent_ffts = max_neighbor_in_other_adjacent_ffts
269                            .max(other_fft[bin_position - 1]);
270                    }
271
272                    if fft_minus_46[bin_position] > max_neighbor_in_other_adjacent_ffts {
273
274                        // This is a peak, store the peak
275
276                        let fft_pass_number = self.num_spread_ffts_done - 46;
277
278                        let peak_magnitude: f32 = fft_minus_46[bin_position].ln().max(1.0 / 64.0) * 1477.3 + 6144.0;
279                        let peak_magnitude_before: f32 = fft_minus_46[bin_position - 1].ln().max(1.0 / 64.0) * 1477.3 + 6144.0;
280                        let peak_magnitude_after: f32 = fft_minus_46[bin_position + 1].ln().max(1.0 / 64.0) * 1477.3 + 6144.0;
281
282                        let peak_variation_1: f32 = peak_magnitude * 2.0 - peak_magnitude_before - peak_magnitude_after;
283                        let peak_variation_2: f32 = (peak_magnitude_after - peak_magnitude_before) * 32.0 / peak_variation_1;
284
285                        let corrected_peak_frequency_bin: u16 = ((bin_position as i32 * 64) + (peak_variation_2 as i32)) as u16;
286
287                        assert!(peak_variation_1 >= 0.0);
288
289                        // Convert back a FFT bin to a frequency, given a 16 KHz sample
290                        // rate, 1024 useful bins and the multiplication by 64 made before
291                        // storing the information
292
293                        let frequency_hz: f32 = corrected_peak_frequency_bin as f32 * (16000.0 / 2.0 / 1024.0 / 64.0);
294
295                        // Ignore peaks outside the 250 Hz-5.5 KHz range, store them into
296                        // a lookup table that will be used to generate the binary fingerprint
297                        // otherwise
298
299                        let frequency_band = match frequency_hz as i32 {
300                            250..=519 => FrequencyBand::_250_520,
301                            520..=1449 => FrequencyBand::_520_1450,
302                            1450..=3499 => FrequencyBand::_1450_3500,
303                            3500..=5500 => FrequencyBand::_3500_5500,
304                            _ => { continue; }
305                        };
306
307                        // In Rust, the entry method returns an Entry object,
308                        // which represents a cell in a HashMap that is either occupied or vacant.
309                        // You can use or_default to insert a value if the key is missing,
310                        // which avoids a double search of the key in the hash map.
311                        self.signature.frequency_band_to_sound_peaks
312                            .entry(frequency_band)
313                            .or_default();
314
315                        self.signature.frequency_band_to_sound_peaks.get_mut(&frequency_band).unwrap().push(
316                            FrequencyPeak {
317                                fft_pass_number,
318                                peak_magnitude: peak_magnitude as u16,
319                                corrected_peak_frequency_bin
320                            }
321                        );
322                    }
323                }
324            }
325        }
326    }
327}