1use image::{Rgba, RgbaImage};
2use memmap2::Mmap;
3use mime::Mime;
4use rustfft::num_complex::Complex;
5use rustfft::num_traits::Zero;
6use rustfft::{Fft, FftPlanner};
7use semdiff_core::fs::FileLeaf;
8use semdiff_core::{Diff, DiffCalculator, MayUnsupported};
9use std::cell::RefCell;
10use std::f32::consts::PI;
11use std::fmt::{Debug, Formatter};
12use std::io::Cursor;
13use std::ops::Range;
14use std::sync::{Arc, LazyLock};
15use std::{convert, iter, mem};
16use symphonia::core::audio::AudioSpec;
17use symphonia::core::codecs::audio::AudioDecoderOptions;
18use symphonia::core::errors::Error as SymphoniaError;
19use symphonia::core::formats::probe::Hint;
20use symphonia::core::formats::{FormatOptions, TrackType};
21use symphonia::core::io::MediaSourceStream;
22use symphonia::core::meta::MetadataOptions;
23use thiserror::Error;
24
25pub mod report_html;
26pub mod report_json;
27pub mod report_summary;
28
29#[cfg(test)]
30mod tests;
31
/// Width, in pixels, of every rendered waveform image.
const WAVEFORM_WIDTH: u32 = 1024;
/// Height, in pixels, of every rendered waveform image.
const WAVEFORM_HEIGHT: u32 = 256;
/// Width, in pixels, of every rendered spectrogram (and spectrogram diff) image.
const SPECTROGRAM_WIDTH: u32 = 1024;
/// Height, in pixels, of every rendered spectrogram (and spectrogram diff) image.
const SPECTROGRAM_HEIGHT: u32 = 256;
/// Number of frequency bins stored for each spectrogram frame.
const SPECTROGRAM_DATA_HEIGHT: usize = 1024;
/// Number of samples fed to each FFT; twice the number of stored bins.
const FFT_WINDOW_SIZE: usize = SPECTROGRAM_DATA_HEIGHT * 2;
/// Small positive floor applied before taking logarithms and when normalising by a range,
/// so that neither `log10(0)` nor a division by zero can occur.
const LOG_EPSILON: f32 = 1e-6;
39
/// Reporter-side entry point that owns the [`SpectrogramAnalyzer`] used to decode audio
/// content into renderable [`AudioData`].
pub struct AudioDiffReporter {
    // Analyzer used by `build_audio_data` to decode the raw file content.
    spectrogram_analyzer: SpectrogramAnalyzer,
}
43
44impl Default for AudioDiffReporter {
45 fn default() -> Self {
46 Self::new()
47 }
48}
49
50impl AudioDiffReporter {
51 pub fn new() -> AudioDiffReporter {
52 AudioDiffReporter {
53 spectrogram_analyzer: SpectrogramAnalyzer::new(),
54 }
55 }
56
57 fn build_audio_data(&self, kind: Mime, content: Arc<Mmap>) -> Result<AudioData, AudioDecodeError> {
58 let decoded = self.spectrogram_analyzer.decode_audio(&kind, &content)?;
59 let stat = AudioStat::from_one(&decoded);
60 Ok(build_audio_data_from_decoded(kind, content, &decoded, &stat))
61 }
62}
63
/// Outcome of comparing two decoded audio files.
#[derive(Debug)]
pub enum AudioDiffStatus {
    /// Loudness and spectrogram differences are both within the configured tolerances.
    Equal(AudioDiffDetail),
    /// At least one of the measured differences exceeds its tolerance.
    Different(AudioDiffDetail),
    /// The files could not be compared at all (e.g. differing sample rate or channel count),
    /// so no detail is available.
    Incomparable,
}
70
71impl AudioDiffStatus {
72 pub fn as_str(&self) -> &'static str {
73 match self {
74 AudioDiffStatus::Equal(_) => "equal",
75 AudioDiffStatus::Different(_) => "different",
76 AudioDiffStatus::Incomparable => "incomparable",
77 }
78 }
79}
80
/// Result of diffing one pair of audio files: the comparison status plus the rendered
/// data of both sides.
#[derive(Debug)]
pub struct AudioDiff {
    // Comparison outcome (with detail when the files were comparable).
    status: AudioDiffStatus,
    // Rendered data of the expected file.
    expected: AudioData,
    // Rendered data of the actual file.
    actual: AudioData,
}
87
88impl Diff for AudioDiff {
89 fn equal(&self) -> bool {
90 matches!(self.status, AudioDiffStatus::Equal(_))
91 }
92}
93
94impl AudioDiff {
95 fn status(&self) -> &AudioDiffStatus {
96 &self.status
97 }
98
99 fn expected(&self) -> &AudioData {
100 &self.expected
101 }
102
103 fn actual(&self) -> &AudioData {
104 &self.actual
105 }
106
107 fn diff_detail(&self) -> Option<&AudioDiffDetail> {
108 match &self.status {
109 AudioDiffStatus::Equal(detail) | AudioDiffStatus::Different(detail) => Some(detail),
110 AudioDiffStatus::Incomparable => None,
111 }
112 }
113}
114
/// Measurements produced when two audio files could be compared.
#[derive(Debug)]
pub struct AudioDiffDetail {
    // One diff image per channel; the alpha channel encodes how many cells differed.
    spectrogram_diff: Vec<RgbaImage>,
    // Scalar statistics of the comparison.
    stat: AudioDiffStat,
}
120
121impl AudioDiffDetail {
122 fn spectrogram_diff(&self) -> &[RgbaImage] {
123 &self.spectrogram_diff
124 }
125
126 fn stat(&self) -> &AudioDiffStat {
127 &self.stat
128 }
129}
130
/// Scalar statistics describing how two aligned audio signals differ.
#[derive(Debug, Clone, Copy)]
pub struct AudioDiffStat {
    /// Fraction of compared spectrogram cells whose difference exceeded the spectral
    /// tolerance, averaged over channels.
    pub spectrogram_diff_rate: f64,
    /// Shift, in samples, that was selected when aligning the two signals.
    pub shift_samples: i32,
    /// Largest per-channel absolute difference of the level computed by `loudness_db`
    /// (an RMS level in dB).
    pub lufs_diff_db: f32,
}
137
/// Decoded metadata and rendered images of one audio file, together with its raw content.
#[derive(Debug)]
pub struct AudioData {
    // MIME type the file was decoded as.
    mime: Mime,
    // Sample rate reported by the decoder, in Hz.
    sample_rate: u32,
    // Channel count reported by the decoder.
    channels: u16,
    // Length of the longest decoded channel divided by the sample rate.
    duration_seconds: f32,
    // One waveform image per decoded channel.
    waveform: Vec<RgbaImage>,
    // One spectrogram image per decoded channel.
    spectrogram: Vec<RgbaImage>,
    // Memory-mapped raw file content.
    content: Arc<Mmap>,
}
148
149impl AudioData {
150 fn mime(&self) -> &Mime {
151 &self.mime
152 }
153
154 fn sample_rate(&self) -> u32 {
155 self.sample_rate
156 }
157
158 fn channels(&self) -> u16 {
159 self.channels
160 }
161
162 fn duration_seconds(&self) -> f32 {
163 self.duration_seconds
164 }
165
166 fn waveform(&self) -> &[RgbaImage] {
167 &self.waveform
168 }
169
170 fn spectrogram(&self) -> &[RgbaImage] {
171 &self.spectrogram
172 }
173
174 fn content(&self) -> &[u8] {
175 &self.content
176 }
177}
178
/// Reasons why audio content could not be decoded.
#[derive(Debug, Error)]
pub enum AudioDecodeError {
    /// An error reported by symphonia while probing, demuxing or decoding.
    #[error("symphonia error: {0}")]
    Symphonia(#[from] SymphoniaError),
    /// The container has no default audio track.
    #[error("no default audio track")]
    NoDefaultTrack,
    /// The selected track carries no (audio) codec parameters.
    #[error("missing audio codec parameters")]
    MissingAudioCodecParameters,
    /// No signal specification could be determined, or its sample rate is zero.
    #[error("missing sample rate")]
    MissingSampleRate,
}
190
/// Compares two audio files after decoding and aligning them.
///
/// The derived `Default` sets every tolerance to zero.
#[derive(Default)]
pub struct AudioDiffCalculator {
    // Largest time shift, in seconds, searched when aligning the two signals.
    shift_tolerance_seconds: f32,
    // Largest loudness difference (dB) still considered equal.
    lufs_tolerance_db: f32,
    // Per-cell spectrogram difference above which a cell counts as different.
    spectral_tolerance: f32,
    // Largest fraction of differing spectrogram cells still considered equal.
    spectrogram_diff_rate_tolerance: f64,
    // Decoder / spectrogram engine shared by all comparisons.
    spectrogram_analyzer: SpectrogramAnalyzer,
}
199
200impl Debug for AudioDiffCalculator {
201 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
202 f.debug_struct("AudioDiffCalculator")
203 .field("shift_tolerance_seconds", &self.shift_tolerance_seconds)
204 .field("lufs_tolerance_db", &self.lufs_tolerance_db)
205 .field("spectral_tolerance", &self.spectral_tolerance)
206 .field("spectrogram_diff_rate_tolerance", &self.spectrogram_diff_rate_tolerance)
207 .finish()
208 }
209}
210
211impl AudioDiffCalculator {
212 fn diff_decoded(&self, expected: &AudioDecoded, actual: &AudioDecoded) -> AudioDiffStatus {
213 if (expected.sample_rate, expected.channels) != (actual.sample_rate, actual.channels) {
214 return AudioDiffStatus::Incomparable;
215 }
216
217 let sample_rate = expected.sample_rate;
218 let max_shift_samples = (self.shift_tolerance_seconds * sample_rate as f32).round() as u32;
219 let (aligned_expected, aligned_actual, shift_samples) =
220 align_samples(expected.samples.clone(), actual.samples.clone(), max_shift_samples);
221
222 let expected_spectrogram = aligned_expected
223 .iter()
224 .map(|channel| self.spectrogram_analyzer.compute(channel))
225 .collect::<Vec<_>>();
226 let actual_spectrogram = aligned_actual
227 .iter()
228 .map(|channel| self.spectrogram_analyzer.compute(channel))
229 .collect::<Vec<_>>();
230
231 let (spectrogram_diff, spectrogram_diff_rate) =
232 self.build_diff_images(&expected_spectrogram, &actual_spectrogram);
233
234 let lufs_diff_db = summarize_channel_metrics(&aligned_expected, &aligned_actual);
235
236 let detail = AudioDiffDetail {
237 spectrogram_diff,
238 stat: AudioDiffStat {
239 spectrogram_diff_rate,
240 shift_samples,
241 lufs_diff_db,
242 },
243 };
244
245 let equal =
246 lufs_diff_db <= self.lufs_tolerance_db && spectrogram_diff_rate <= self.spectrogram_diff_rate_tolerance;
247 if equal {
248 AudioDiffStatus::Equal(detail)
249 } else {
250 AudioDiffStatus::Different(detail)
251 }
252 }
253}
254
255impl DiffCalculator<FileLeaf> for AudioDiffCalculator {
256 type Error = convert::Infallible;
257 type Diff = AudioDiff;
258
259 fn diff(
260 &self,
261 _name: &str,
262 expected: FileLeaf,
263 actual: FileLeaf,
264 ) -> Result<MayUnsupported<Self::Diff>, Self::Error> {
265 if !is_audio_kind(&expected.kind) || !is_audio_kind(&actual.kind) {
266 return Ok(MayUnsupported::Unsupported);
267 }
268 let Ok(expected_decoded) = self
269 .spectrogram_analyzer
270 .decode_audio(&expected.kind, expected.content.as_ref())
271 else {
272 return Ok(MayUnsupported::Unsupported);
273 };
274 let Ok(actual_decoded) = self
275 .spectrogram_analyzer
276 .decode_audio(&actual.kind, actual.content.as_ref())
277 else {
278 return Ok(MayUnsupported::Unsupported);
279 };
280 let stat_decoded = AudioStat::from_pair(&expected_decoded, &actual_decoded);
281 let expected_data =
282 build_audio_data_from_decoded(expected.kind, expected.content, &expected_decoded, &stat_decoded);
283 let actual_data = build_audio_data_from_decoded(actual.kind, actual.content, &actual_decoded, &stat_decoded);
284 let status = self.diff_decoded(&expected_decoded, &actual_decoded);
285
286 Ok(MayUnsupported::Ok(AudioDiff {
287 status,
288 expected: expected_data,
289 actual: actual_data,
290 }))
291 }
292}
293
/// Value ranges used to scale waveform and spectrogram images.
#[derive(Debug)]
struct AudioStat {
    // Largest absolute sample value.
    signal_max: f32,
    // Smallest spectrogram value.
    spectrogram_min: f32,
    // Largest spectrogram value.
    spectrogram_max: f32,
    // Duration, in seconds, that the full image width represents.
    duration: f32,
}
301
302impl AudioStat {
303 fn from_one(decoded: &AudioDecoded) -> AudioStat {
304 let signal_max = decoded
305 .samples
306 .iter()
307 .flatten()
308 .copied()
309 .map(f32::abs)
310 .fold(0.0, f32::max);
311 let (spectrogram_min, spectrogram_max) = decoded
312 .spectrograms
313 .iter()
314 .flatten()
315 .flatten()
316 .copied()
317 .fold((f32::INFINITY, f32::NEG_INFINITY), |(min, max), v| {
318 (v.min(min), v.max(max))
319 });
320 let duration = decoded.duration_seconds;
321 AudioStat {
322 signal_max,
323 spectrogram_min,
324 spectrogram_max,
325 duration,
326 }
327 }
328
329 fn from_pair(expected: &AudioDecoded, actual: &AudioDecoded) -> AudioStat {
330 let signal_max = expected
331 .samples
332 .iter()
333 .chain(actual.samples.iter())
334 .flatten()
335 .copied()
336 .map(f32::abs)
337 .fold(0.0, f32::max);
338 let (spectrogram_min, spectrogram_max) = expected
339 .spectrograms
340 .iter()
341 .chain(actual.spectrograms.iter())
342 .flatten()
343 .flatten()
344 .copied()
345 .filter(|v| v.is_finite())
346 .fold((f32::INFINITY, f32::NEG_INFINITY), |(min, max), v| {
347 (v.min(min), v.max(max))
348 });
349 let duration = expected.duration_seconds.max(actual.duration_seconds);
350 AudioStat {
351 signal_max,
352 spectrogram_min,
353 spectrogram_max,
354 duration,
355 }
356 }
357}
358
359impl AudioDiffCalculator {
360 pub fn new(
361 shift_tolerance_seconds: f32,
362 lufs_tolerance_db: f32,
363 spectral_tolerance: f32,
364 spectrogram_diff_rate_tolerance: f64,
365 ) -> Self {
366 Self {
367 shift_tolerance_seconds,
368 lufs_tolerance_db,
369 spectral_tolerance,
370 spectrogram_diff_rate_tolerance,
371 spectrogram_analyzer: SpectrogramAnalyzer::new(),
372 }
373 }
374
375 fn build_diff_images(
376 &self,
377 expected: &[Vec<[f32; SPECTROGRAM_DATA_HEIGHT]>],
378 actual: &[Vec<[f32; SPECTROGRAM_DATA_HEIGHT]>],
379 ) -> (Vec<RgbaImage>, f64) {
380 assert_eq!(expected.len(), actual.len());
381 let mut diff_images = Vec::with_capacity(expected.len());
382 let mut diff_rate_sum = 0.0;
383 for (expected_frame, actual_frame) in expected.iter().zip(actual.iter()) {
384 let (diff_image, diff_rate) = self.diff_spectrograms(expected_frame, actual_frame);
385 diff_images.push(diff_image);
386 diff_rate_sum += diff_rate;
387 }
388 (diff_images, diff_rate_sum / expected.len() as f64)
389 }
390
391 fn diff_spectrograms(
392 &self,
393 expected: &[[f32; SPECTROGRAM_DATA_HEIGHT]],
394 actual: &[[f32; SPECTROGRAM_DATA_HEIGHT]],
395 ) -> (RgbaImage, f64) {
396 let spectrogram_len = expected.len().max(actual.len());
397 let mut diff_image = RgbaImage::from_pixel(SPECTROGRAM_WIDTH, SPECTROGRAM_HEIGHT, Rgba([255, 255, 255, 0]));
398 let mut diff_count = 0usize;
399 let mut total_count = 0usize;
400 assert!(SPECTROGRAM_DATA_HEIGHT >= SPECTROGRAM_HEIGHT as usize);
401 if spectrogram_len >= SPECTROGRAM_WIDTH as usize {
402 for x in 0..SPECTROGRAM_WIDTH {
403 let x_range = x as usize * spectrogram_len / SPECTROGRAM_WIDTH as usize
404 ..(x + 1) as usize * spectrogram_len / SPECTROGRAM_WIDTH as usize;
405 for y in 0..SPECTROGRAM_HEIGHT {
406 let y_range = spectrogram_log_bin_range(y);
407
408 let mut diff_sum = 0usize;
409 for y in y_range.clone() {
410 for x in x_range.clone() {
411 let expected = expected.get(x).map(|x| x[y]);
412 let actual = actual.get(x).map(|x| x[y]);
413 let diff = (expected.unwrap_or(f32::INFINITY) - actual.unwrap_or(f32::NEG_INFINITY)).abs();
414 total_count += 1;
415 if diff > self.spectral_tolerance {
416 diff_sum += 1;
417 diff_count += 1;
418 }
419 }
420 }
421 diff_image.put_pixel(
422 x,
423 SPECTROGRAM_HEIGHT - y - 1,
424 Rgba([
425 255,
426 255,
427 255,
428 (diff_sum as f64 / (x_range.len() * y_range.len()) as f64 * 255.0) as u8,
429 ]),
430 );
431 }
432 }
433 } else {
434 for x in 0..spectrogram_len {
435 let image_x_range = x as u32 * SPECTROGRAM_WIDTH / spectrogram_len as u32
436 ..(x + 1) as u32 * SPECTROGRAM_WIDTH / spectrogram_len as u32;
437 for y in 0..SPECTROGRAM_HEIGHT {
438 let y_range = spectrogram_log_bin_range(y);
439 let mut diff_sum = 0usize;
440 for y in y_range.clone() {
441 let expected = expected.get(x).map(|x| x[y]);
442 let actual = actual.get(x).map(|x| x[y]);
443 let diff = (expected.unwrap_or(f32::INFINITY) - actual.unwrap_or(f32::NEG_INFINITY)).abs();
444 total_count += 1;
445 if diff > self.spectral_tolerance {
446 diff_sum += 1;
447 diff_count += 1;
448 }
449 }
450 let color = Rgba([255, 255, 255, (diff_sum as f64 / y_range.len() as f64 * 255.0) as u8]);
451 for x in image_x_range.clone() {
452 diff_image.put_pixel(x, SPECTROGRAM_HEIGHT - y - 1, color);
453 }
454 }
455 }
456 }
457 let diff_rate = if total_count == 0 {
458 0.0
459 } else {
460 diff_count as f64 / total_count as f64
461 };
462 (diff_image, diff_rate)
463 }
464}
465
466pub fn audio_extension(kind: &Mime) -> Option<&'static str> {
467 match kind.essence_str() {
468 "audio/mpeg" => Some("mp3"),
469 "audio/wav" | "audio/x-wav" => Some("wav"),
470 "audio/flac" | "audio/x-flac" => Some("flac"),
471 "audio/aiff" | "audio/x-aiff" => Some("aiff"),
472 "audio/ogg" | "application/ogg" => Some("ogg"),
473 "audio/opus" => Some("opus"),
474 "audio/webm" | "video/webm" => Some("webm"),
475 "video/x-matroska" => Some("mkv"),
476 "audio/aac" => Some("aac"),
477 "audio/mp4" | "audio/m4a" | "audio/x-m4a" => Some("m4a"),
478 "video/mp4" => Some("mp4"),
479 "video/quicktime" => Some("mov"),
480 _ => mime_guess::get_mime_extensions(kind).and_then(|exts| exts.first().copied()),
481 }
482}
483
484fn is_audio_kind(kind: &Mime) -> bool {
485 kind.type_() == mime::AUDIO || kind.type_() == mime::VIDEO
486}
487
488fn build_audio_data_from_decoded(
489 mime: Mime,
490 content: Arc<Mmap>,
491 decoded: &AudioDecoded,
492 stat: &AudioStat,
493) -> AudioData {
494 let waveform = render_waveforms(&decoded.samples, stat, decoded.sample_rate);
495 let spectrogram = render_spectrograms(&decoded.spectrograms, stat, decoded.sample_rate);
496 AudioData {
497 mime,
498 sample_rate: decoded.sample_rate,
499 channels: decoded.channels,
500 duration_seconds: decoded.duration_seconds,
501 waveform,
502 spectrogram,
503 content,
504 }
505}
506
/// Fully decoded audio: planar samples plus the spectrogram of each channel.
struct AudioDecoded {
    // Sample rate in Hz (never zero once decoding has succeeded).
    sample_rate: u32,
    // Channel count taken from the stream's signal specification.
    channels: u16,
    // Length of the longest sample plane divided by the sample rate.
    duration_seconds: f32,
    // One vector of samples per decoded plane.
    samples: Vec<Vec<f32>>,
    // One spectrogram (sequence of frames) per decoded plane.
    spectrograms: Vec<Vec<[f32; SPECTROGRAM_DATA_HEIGHT]>>,
}
514
515fn align_samples(
516 mut expected: Vec<Vec<f32>>,
517 mut actual: Vec<Vec<f32>>,
518 max_shift_samples: u32,
519) -> (Vec<Vec<f32>>, Vec<Vec<f32>>, i32) {
520 assert_eq!(expected.len(), actual.len());
521 if max_shift_samples == 0 || expected.is_empty() || expected[0].is_empty() || actual[0].is_empty() {
522 let signal_len = expected
523 .iter()
524 .chain(actual.iter())
525 .map(|signal| signal.len())
526 .min()
527 .unwrap_or(0);
528 expected
529 .iter_mut()
530 .chain(actual.iter_mut())
531 .for_each(|signal| signal.truncate(signal_len));
532 return (expected, actual, 0);
533 }
534 let convolution_len = expected
535 .iter()
536 .zip(actual.iter())
537 .map(|(expected, actual)| expected.len() + actual.len())
538 .max()
539 .unwrap();
540 let fft_len = convolution_len.next_power_of_two();
541 let mut planner = FftPlanner::new();
542 let forward = planner.plan_fft_forward(fft_len);
543 let backward = planner.plan_fft_inverse(fft_len);
544 struct CorrelationBuffer {
545 expected_buffer: Vec<Complex<f32>>,
546 actual_buffer: Vec<Complex<f32>>,
547 scratch_buffer: Vec<Complex<f32>>,
548 sums_expected: Vec<f32>,
549 sums_actual: Vec<f32>,
550 }
551 thread_local! {
552 static CORRELATION_BUFFER: RefCell<CorrelationBuffer> = const {
553 RefCell::new(CorrelationBuffer {
554 expected_buffer: Vec::new(),
555 actual_buffer: Vec::new(),
556 scratch_buffer: Vec::new(),
557 sums_expected: Vec::new(),
558 sums_actual: Vec::new(),
559 })
560 };
561 }
562 let correlation = CORRELATION_BUFFER.with(|correlation_buffer| {
563 let mut correlation_buffer = correlation_buffer.borrow_mut();
564 let CorrelationBuffer {
565 expected_buffer,
566 actual_buffer,
567 scratch_buffer,
568 sums_expected,
569 sums_actual,
570 } = &mut *correlation_buffer;
571 expected_buffer.resize(fft_len, Complex::zero());
572 actual_buffer.resize(fft_len, Complex::zero());
573 scratch_buffer.resize(
574 forward
575 .get_inplace_scratch_len()
576 .max(backward.get_inplace_scratch_len()),
577 Complex::zero(),
578 );
579 expected.iter().zip(actual.iter()).fold(
580 vec![0f32; (max_shift_samples * 2 + 1) as usize],
581 |mut acc, (expected, actual)| {
582 sums_expected.clear();
583 sums_actual.clear();
584 sums_expected.extend(
585 expected
586 .iter()
587 .map(|sample| sample * sample)
588 .chain(iter::once(0f32))
589 .scan(0f32, |acc, power| Some(mem::replace(acc, *acc + power))),
590 );
591 sums_actual.extend(
592 actual
593 .iter()
594 .map(|sample| sample * sample)
595 .chain(iter::once(0f32))
596 .scan(0f32, |acc, power| Some(mem::replace(acc, *acc + power))),
597 );
598
599 expected_buffer.fill(Complex::zero());
600 actual_buffer.fill(Complex::zero());
601 expected_buffer
602 .iter_mut()
603 .zip(expected.iter())
604 .for_each(|(buffer, sample)| *buffer = Complex::from(*sample));
605 actual_buffer
606 .iter_mut()
607 .zip(actual.iter().rev())
608 .for_each(|(buffer, sample)| *buffer = Complex::from(*sample));
609 forward.process_with_scratch(expected_buffer, scratch_buffer);
610 forward.process_with_scratch(actual_buffer, scratch_buffer);
611 expected_buffer
612 .iter_mut()
613 .zip(actual_buffer.iter())
614 .for_each(|(expected, actual)| *expected *= actual);
615 backward.process_with_scratch(expected_buffer, scratch_buffer);
616 acc.iter_mut()
617 .zip(-(max_shift_samples as i32)..=max_shift_samples as i32)
618 .for_each(|(acc, shift)| {
619 let (expected_range, actual_range) = overlap_range(expected.len(), actual.len(), shift);
620 let denom = ((sums_expected[expected_range.end] - sums_expected[expected_range.start])
621 * (sums_actual[actual_range.end] - sums_actual[actual_range.start]))
622 .sqrt()
623 .max(1e-6);
624 *acc += usize::try_from(actual.len() as isize - 1 - shift as isize)
625 .ok()
626 .and_then(|i| expected_buffer.get(i))
627 .map_or(f32::INFINITY, |v| v.re / denom);
628 });
629 acc
630 },
631 )
632 });
633 let (_, best_shift) = correlation
634 .iter()
635 .zip(-(max_shift_samples as i32)..=max_shift_samples as i32)
636 .max_by(|&(&c1, shift1), &(&c2, shift2)| {
637 if (c1 - c2).abs() < f32::EPSILON {
638 shift1.abs().cmp(&shift2.abs()).reverse()
639 } else {
640 c1.partial_cmp(&c2).unwrap()
641 }
642 })
643 .unwrap();
644
645 for (expected, actual) in expected.iter_mut().zip(actual.iter_mut()) {
646 let (expected_range, actual_range) = overlap_range(expected.len(), actual.len(), best_shift);
647 expected.drain(..expected_range.start.min(expected.len()));
648 actual.drain(..actual_range.start.min(actual.len()));
649 expected.truncate(expected_range.len());
650 actual.truncate(actual_range.len());
651 }
652
653 (expected, actual, best_shift)
654}
655
656fn summarize_channel_metrics(expected: &[Vec<f32>], actual: &[Vec<f32>]) -> f32 {
657 let channel_count = expected.len().min(actual.len());
658 if channel_count == 0 {
659 return f32::INFINITY;
660 }
661 let mut max_lufs_diff = 0.0f32;
662 for channel_index in 0..channel_count {
663 let expected_channel = &expected[channel_index];
664 let actual_channel = &actual[channel_index];
665 if expected_channel.is_empty() || actual_channel.is_empty() {
666 continue;
667 }
668 let expected_lufs = loudness_db(expected_channel);
669 let actual_lufs = loudness_db(actual_channel);
670 max_lufs_diff = max_lufs_diff.max((expected_lufs - actual_lufs).abs());
671 }
672 max_lufs_diff
673}
674
675fn render_waveforms(samples: &[Vec<f32>], stat: &AudioStat, sample_rate: u32) -> Vec<RgbaImage> {
676 samples
677 .iter()
678 .map(|channel| render_waveform(channel, stat, sample_rate))
679 .collect()
680}
681
/// Computes the index ranges that overlap when `actual` is shifted against `expected`.
///
/// A non-negative `shift` pairs `expected[i]` with `actual[i + shift]`; a negative one
/// pairs `expected[i + |shift|]` with `actual[i]`. The two returned ranges
/// (`expected` first, `actual` second) always have the same length.
///
/// Both ranges are guaranteed to lie within their signal (`end <= len`), even when
/// `|shift|` exceeds the signal length; in that case they are empty and positioned at the
/// end of the shifted signal. This lets callers index prefix-sum tables of length
/// `len + 1` with either bound without further checks.
fn overlap_range(expected_len: usize, actual_len: usize, shift: i32) -> (Range<usize>, Range<usize>) {
    // `unsigned_abs` is well defined for `i32::MIN`, unlike negation.
    let magnitude = shift.unsigned_abs() as usize;
    if shift >= 0 {
        let start = magnitude.min(actual_len);
        let len = expected_len.min(actual_len - start);
        (0..len, start..start + len)
    } else {
        let start = magnitude.min(expected_len);
        let len = actual_len.min(expected_len - start);
        (start..start + len, 0..len)
    }
}
693
694fn loudness_db(samples: &[f32]) -> f32 {
695 if samples.is_empty() {
696 return -100.0;
697 }
698 let power = samples.iter().map(|sample| sample * sample).sum::<f32>() / samples.len() as f32;
699 let rms = power.sqrt();
700 20.0 * rms.max(LOG_EPSILON).log10()
701}
702
703fn render_waveform(samples: &[f32], stat: &AudioStat, sample_rate: u32) -> RgbaImage {
704 const WAVEFORM_COLOR: Rgba<u8> = Rgba([255, 255, 255, 255]);
705 let clip = (stat.signal_max * 1.2).clamp(LOG_EPSILON, 1.0);
706 let mut image = RgbaImage::from_pixel(WAVEFORM_WIDTH, WAVEFORM_HEIGHT, Rgba([255, 255, 255, 0]));
707 if stat.duration <= 0.0 || sample_rate == 0 {
708 return image;
709 }
710 let to_y = |value: f32| {
711 let normalized = (value + clip) / (2.0 * clip);
712 ((normalized * WAVEFORM_HEIGHT as f32).round() as u32).min(WAVEFORM_HEIGHT - 1)
713 };
714 let duration = stat.duration;
715 for x in 0..WAVEFORM_WIDTH {
716 let start_time = x as f32 * duration / WAVEFORM_WIDTH as f32;
717 let end_time = (x + 1) as f32 * duration / WAVEFORM_WIDTH as f32;
718 let start = (start_time * sample_rate as f32).floor() as usize;
719 let end = (end_time * sample_rate as f32).ceil() as usize;
720 let start = start.min(samples.len());
721 let end = end.min(samples.len());
722 if end <= start {
723 continue;
724 }
725 let (min, max) = samples[start..end]
726 .iter()
727 .fold((1.0f32, -1.0f32), |(min, max), &s| (min.min(s), max.max(s)));
728 let y_min = to_y(min);
729 let y_max = to_y(max);
730 for y in y_min..=y_max {
731 image.put_pixel(x, WAVEFORM_HEIGHT - y - 1, WAVEFORM_COLOR);
732 }
733 }
734 image
735}
736
737fn render_spectrograms(
738 spectrograms: &[Vec<[f32; SPECTROGRAM_DATA_HEIGHT]>],
739 stat: &AudioStat,
740 sample_rate: u32,
741) -> Vec<RgbaImage> {
742 spectrograms
743 .iter()
744 .map(|channel| render_spectrogram(channel, stat, sample_rate))
745 .collect()
746}
747
/// Returns the range of spectrogram bins that image row `y` (counted from the lowest
/// frequency) aggregates, using a logarithmic frequency axis.
///
/// The mapping is computed once and cached. Bin `b` is placed at the fractional row
/// `SPECTROGRAM_HEIGHT * log_B(b / A + 1)`, with `B = 20` and `A` chosen so that the last
/// bin lands on the last row.
///
/// # Panics
///
/// Panics if `y >= SPECTROGRAM_HEIGHT`.
fn spectrogram_log_bin_range(y: u32) -> Range<usize> {
    static RANGES: LazyLock<[Range<usize>; SPECTROGRAM_HEIGHT as usize]> = LazyLock::new(|| {
        const B: f64 = 20.0;
        const A: f64 = SPECTROGRAM_DATA_HEIGHT as f64 / (B - 1.0);
        let mut ranges = [const { 0usize..0 }; SPECTROGRAM_HEIGHT as usize];
        let mut wrote = 0;
        // Low frequencies: a single bin covers one or more rows, so walk the bins and
        // assign each one to the rows it spans. Stop at the first bin narrower than a row.
        for y in 0..SPECTROGRAM_DATA_HEIGHT {
            let p1 = f64::log(1.0 / A * y as f64 + 1.0, B);
            let p2 = f64::log(1.0 / A * (y + 1) as f64 + 1.0, B);
            let range = p1 * SPECTROGRAM_HEIGHT as f64..p2 * SPECTROGRAM_HEIGHT as f64;
            if range.end - range.start < 1.0 {
                break;
            }
            ranges[range.start.round() as usize..range.end.round() as usize].fill(y..y + 1);
            wrote = y + 1;
        }
        // Remaining rows: invert the mapping so that each row covers the bins between its
        // lower and upper edge.
        // NOTE(review): `wrote` counts bins, but it is used here as a row index to skip to;
        // rows between `wrote` and the last row filled above are overwritten by this
        // formula — confirm that this overlap is intended.
        for (y, slot) in ranges
            .iter_mut()
            .enumerate()
            .take(SPECTROGRAM_HEIGHT as usize)
            .skip(wrote)
        {
            let range = (A * (f64::powf(B, y as f64 / SPECTROGRAM_HEIGHT as f64) - 1.0)).round() as usize
                ..(A * (f64::powf(B, (y + 1) as f64 / SPECTROGRAM_HEIGHT as f64) - 1.0)).round() as usize;
            *slot = range;
        }
        ranges
    });
    RANGES[y as usize].clone()
}
778
/// Decodes audio and computes spectrograms with a pre-planned FFT and window.
struct SpectrogramAnalyzer {
    // Forward FFT of size `FFT_WINDOW_SIZE`.
    fft: Arc<dyn Fft<f32>>,
    // Window coefficients, one per sample of an FFT frame.
    window: Box<[f32]>,
}
783
784impl Default for SpectrogramAnalyzer {
785 fn default() -> Self {
786 Self::new()
787 }
788}
789
impl SpectrogramAnalyzer {
    /// Plans a forward FFT of `FFT_WINDOW_SIZE` points and precomputes a half-sine window
    /// (`sin(pi * i / (N - 1))`) of the same length.
    fn new() -> SpectrogramAnalyzer {
        let fft = FftPlanner::<f32>::new().plan_fft_forward(FFT_WINDOW_SIZE);
        let window = (0..FFT_WINDOW_SIZE)
            .map(|i| (PI * i as f32 / (FFT_WINDOW_SIZE - 1) as f32).sin())
            .collect();
        SpectrogramAnalyzer { fft, window }
    }

    /// Decodes the default audio track of `content` into planar `f32` samples and computes
    /// a spectrogram for every decoded plane.
    ///
    /// `mime` is only used to derive a file-extension hint for format probing.
    ///
    /// # Errors
    ///
    /// * [`AudioDecodeError::Symphonia`] when probing, creating the decoder, reading a
    ///   packet or decoding a packet fails.
    /// * [`AudioDecodeError::NoDefaultTrack`] when the container has no default audio track.
    /// * [`AudioDecodeError::MissingAudioCodecParameters`] when the track has no audio codec
    ///   parameters.
    /// * [`AudioDecodeError::MissingSampleRate`] when no signal specification is known after
    ///   decoding, or its rate is zero.
    fn decode_audio(&self, mime: &Mime, content: &[u8]) -> Result<AudioDecoded, AudioDecodeError> {
        let mut hint = Hint::new();
        if let Some(extension) = audio_extension(mime) {
            hint.with_extension(extension);
        }

        // The media source takes ownership of its data, so the bytes are copied here.
        let owned = content.to_vec();
        let mss = MediaSourceStream::new(Box::new(Cursor::new(owned)), Default::default());
        let mut format =
            symphonia::default::get_probe().probe(&hint, mss, FormatOptions::default(), MetadataOptions::default())?;
        let track = format
            .default_track(TrackType::Audio)
            .ok_or(AudioDecodeError::NoDefaultTrack)?;
        let track_id = track.id;
        let codec_params = track
            .codec_params
            .clone()
            .ok_or(AudioDecodeError::MissingAudioCodecParameters)?;
        let audio_codec_params = codec_params
            .audio()
            .ok_or(AudioDecodeError::MissingAudioCodecParameters)?;
        let mut decoder =
            symphonia::default::get_codecs().make_audio_decoder(audio_codec_params, &AudioDecoderOptions::default())?;

        let mut samples = Vec::<Vec<f32>>::new();
        // Prefer the spec announced by the codec parameters; otherwise it is taken from the
        // first decoded packet below.
        let mut signal_spec = match (audio_codec_params.sample_rate, audio_codec_params.channels.clone()) {
            (Some(rate), Some(channels)) => Some(AudioSpec::new(rate, channels)),
            _ => None,
        };
        loop {
            let packet = match format.next_packet() {
                Ok(Some(packet)) => packet,
                // End of stream.
                Ok(None) => break,
                Err(SymphoniaError::ResetRequired) => {
                    decoder.reset();
                    continue;
                }
                Err(err) => return Err(err.into()),
            };
            // Packets belonging to other tracks are ignored.
            if packet.track_id != track_id {
                continue;
            }
            let decoded = decoder.decode(&packet)?;
            if signal_spec.is_none() {
                signal_spec = Some(decoded.spec().clone());
            }
            let mut packet_samples = Vec::<Vec<f32>>::new();
            decoded.copy_to_vecs_planar(&mut packet_samples);
            // Keep one accumulator per plane of the current packet, then append each plane.
            samples.resize_with(packet_samples.len(), Vec::new);
            for (plane, samples) in packet_samples.into_iter().zip(samples.iter_mut()) {
                samples.extend(plane);
            }
        }

        let Some(signal_spec) = signal_spec else {
            return Err(AudioDecodeError::MissingSampleRate);
        };

        let max_len = samples.iter().map(|channel| channel.len()).max().unwrap_or(0);
        let sample_rate = signal_spec.rate();
        if sample_rate == 0 {
            return Err(AudioDecodeError::MissingSampleRate);
        }
        let duration_seconds = max_len as f32 / sample_rate as f32;

        let spectrograms = samples.iter().map(|sample| self.compute(sample)).collect::<Vec<_>>();

        Ok(AudioDecoded {
            sample_rate,
            channels: signal_spec.channels().count() as u16,
            duration_seconds,
            samples,
            spectrograms,
        })
    }

    /// Computes the spectrogram of one channel.
    ///
    /// Frames start every `FFT_WINDOW_SIZE / 2` samples and the tail of the signal is
    /// zero-padded. Each frame stores `log10` of the squared magnitude (floored at `1e-32`,
    /// i.e. never below `-32`) of the first `SPECTROGRAM_DATA_HEIGHT` FFT bins.
    ///
    /// Because a frame is emitted for every start offset up to and including
    /// `samples.len()`, the result always contains at least one frame, even for empty input.
    fn compute(&self, samples: &[f32]) -> Vec<[f32; SPECTROGRAM_DATA_HEIGHT]> {
        let mut buffer =
            Box::<[Complex<f32>; FFT_WINDOW_SIZE]>::try_from(vec![Complex::zero(); FFT_WINDOW_SIZE]).unwrap();
        let mut scratch = vec![Complex::zero(); self.fft.get_inplace_scratch_len()];
        let mut result = Vec::with_capacity(samples.len() / (FFT_WINDOW_SIZE / 2));
        for i in 0.. {
            // `get` yields `None` only once the start offset is past the end of the signal.
            let Some(samples) = samples.get(i * (FFT_WINDOW_SIZE / 2)..) else {
                break;
            };
            // Fill the whole buffer with windowed samples, padding with zeros past the end.
            buffer
                .iter_mut()
                .zip(
                    samples
                        .iter()
                        .copied()
                        .chain(iter::repeat(0.0))
                        .zip(self.window.iter().copied()),
                )
                .for_each(|(slot, (s, w))| *slot = Complex::from(s * w));
            self.fft.process_with_scratch(&mut *buffer, &mut scratch);
            result.push([0.0; SPECTROGRAM_DATA_HEIGHT]);
            // The zip stops at the row length, so only the first half of the FFT is kept.
            result
                .last_mut()
                .unwrap()
                .iter_mut()
                .zip(buffer.iter().copied())
                .for_each(|(slot, b)| *slot = b.norm_sqr().max(1e-32).log10());
        }
        result
    }
}
906
907fn render_spectrogram(spectrogram: &[[f32; SPECTROGRAM_DATA_HEIGHT]], stat: &AudioStat, sample_rate: u32) -> RgbaImage {
908 let mut image = RgbaImage::from_pixel(SPECTROGRAM_WIDTH, SPECTROGRAM_HEIGHT, Rgba([255, 255, 255, 0]));
909 if spectrogram.is_empty() || stat.duration <= 0.0 || sample_rate == 0 {
910 return image;
911 }
912
913 let value_range = (stat.spectrogram_max - stat.spectrogram_min).max(LOG_EPSILON);
914 let map_value = |v: f32| (v - stat.spectrogram_min) / value_range;
915 assert!(SPECTROGRAM_HEIGHT <= SPECTROGRAM_DATA_HEIGHT as u32);
916 let duration = stat.duration;
917 let hop_samples = (FFT_WINDOW_SIZE / 2) as f32;
918 let frame_duration = hop_samples / sample_rate as f32;
919 if frame_duration <= 0.0 {
920 return image;
921 }
922
923 for x in 0..SPECTROGRAM_WIDTH {
924 let start_time = x as f32 * duration / SPECTROGRAM_WIDTH as f32;
925 let end_time = (x + 1) as f32 * duration / SPECTROGRAM_WIDTH as f32;
926 let start = (start_time / frame_duration).floor() as usize;
927 let end = (end_time / frame_duration).ceil() as usize;
928 let start = start.min(spectrogram.len());
929 let end = end.min(spectrogram.len());
930 if end <= start {
931 continue;
932 }
933
934 for y in 0..SPECTROGRAM_HEIGHT {
935 let y_range = spectrogram_log_bin_range(y);
936 let sum = spectrogram[start..end]
937 .iter()
938 .flat_map(|spec| spec[y_range.clone()].iter().copied())
939 .sum::<f32>();
940 let value = sum / ((end - start) * y_range.len()) as f32;
941 let intensity = map_value(value);
942 image.put_pixel(
943 x,
944 SPECTROGRAM_HEIGHT - y - 1,
945 Rgba([255, 255, 255, (intensity * 255.0) as u8]),
946 );
947 }
948 }
949 image
950}