1use image::{Rgba, RgbaImage};
2use memmap2::Mmap;
3use mime::Mime;
4use rustfft::num_complex::Complex;
5use rustfft::num_traits::Zero;
6use rustfft::{Fft, FftPlanner};
7use semdiff_core::fs::FileLeaf;
8use semdiff_core::{Diff, DiffCalculator, MayUnsupported};
9use std::f32::consts::PI;
10use std::fmt::{Debug, Formatter};
11use std::io::{Cursor, ErrorKind};
12use std::ops::Range;
13use std::sync::{Arc, LazyLock};
14use std::{convert, iter};
15use symphonia::core::audio::{AudioBuffer, SignalSpec};
16use symphonia::core::codecs::DecoderOptions;
17use symphonia::core::errors::Error as SymphoniaError;
18use symphonia::core::formats::FormatOptions;
19use symphonia::core::io::MediaSourceStream;
20use symphonia::core::meta::MetadataOptions;
21use symphonia::core::probe::Hint;
22use thiserror::Error;
23
// Report back-ends for audio diffs. Presumably one module per output format
// (HTML, JSON, plain summary) — inferred from the module names only; see each
// module for its actual contract.
pub mod report_html;
pub mod report_json;
pub mod report_summary;

// Unit tests live in a sibling `tests` module that is only compiled for test builds.
#[cfg(test)]
mod tests;
30
/// Width in pixels of every rendered waveform image.
const WAVEFORM_WIDTH: u32 = 1024;
/// Height in pixels of every rendered waveform image.
const WAVEFORM_HEIGHT: u32 = 256;
/// Width in pixels of every rendered spectrogram (and spectrogram diff) image.
const SPECTROGRAM_WIDTH: u32 = 1024;
/// Height in pixels of every rendered spectrogram (and spectrogram diff) image.
const SPECTROGRAM_HEIGHT: u32 = 256;
/// Number of frequency bins stored per spectrogram frame.
const SPECTROGRAM_DATA_HEIGHT: usize = 1024;
/// FFT length in samples; twice the number of stored bins. Frames advance by
/// half of this value (the hop size used throughout this file).
const FFT_WINDOW_SIZE: usize = SPECTROGRAM_DATA_HEIGHT * 2;
/// Small positive floor used to avoid division by zero and `log10(0)`.
const LOG_EPSILON: f32 = 1e-6;
38
/// Reporter-side helper that owns the analyzer used to decode audio files
/// into renderable [`AudioData`].
pub struct AudioDiffReporter {
    // Decoder + FFT state reused across every file this reporter handles.
    spectrogram_analyzer: SpectrogramAnalyzer,
}
42
43impl Default for AudioDiffReporter {
44 fn default() -> Self {
45 Self::new()
46 }
47}
48
49impl AudioDiffReporter {
50 pub fn new() -> AudioDiffReporter {
51 AudioDiffReporter {
52 spectrogram_analyzer: SpectrogramAnalyzer::new(),
53 }
54 }
55
56 fn build_audio_data(&self, kind: Mime, content: Arc<Mmap>) -> Result<AudioData, AudioDecodeError> {
57 let decoded = self.spectrogram_analyzer.decode_audio(&kind, &content)?;
58 let stat = AudioStat::from_one(&decoded);
59 Ok(build_audio_data_from_decoded(kind, content, &decoded, &stat))
60 }
61}
62
/// Outcome of comparing two decoded audio streams.
#[derive(Debug)]
pub enum AudioDiffStatus {
    /// Both the loudness difference and the spectrogram diff rate are within
    /// the configured tolerances.
    Equal(AudioDiffDetail),
    /// At least one of the tolerances was exceeded.
    Different(AudioDiffDetail),
    /// The streams differ in sample rate or channel count, so no detailed
    /// comparison was attempted.
    Incomparable,
}
69
70impl AudioDiffStatus {
71 pub fn as_str(&self) -> &'static str {
72 match self {
73 AudioDiffStatus::Equal(_) => "equal",
74 AudioDiffStatus::Different(_) => "different",
75 AudioDiffStatus::Incomparable => "incomparable",
76 }
77 }
78}
79
/// Result of diffing one pair of audio files: the comparison status plus the
/// rendered data of both sides.
#[derive(Debug)]
pub struct AudioDiff {
    // Comparison outcome (carries the diff detail unless incomparable).
    status: AudioDiffStatus,
    // Rendered data of the expected file.
    expected: AudioData,
    // Rendered data of the actual file.
    actual: AudioData,
}
86
87impl Diff for AudioDiff {
88 fn equal(&self) -> bool {
89 matches!(self.status, AudioDiffStatus::Equal(_))
90 }
91}
92
93impl AudioDiff {
94 fn status(&self) -> &AudioDiffStatus {
95 &self.status
96 }
97
98 fn expected(&self) -> &AudioData {
99 &self.expected
100 }
101
102 fn actual(&self) -> &AudioData {
103 &self.actual
104 }
105
106 fn diff_detail(&self) -> Option<&AudioDiffDetail> {
107 match &self.status {
108 AudioDiffStatus::Equal(detail) | AudioDiffStatus::Different(detail) => Some(detail),
109 AudioDiffStatus::Incomparable => None,
110 }
111 }
112}
113
/// Detailed comparison data produced when two streams were comparable.
#[derive(Debug)]
pub struct AudioDiffDetail {
    // One diff image per channel; the alpha channel encodes the fraction of
    // differing spectrogram cells behind each pixel.
    spectrogram_diff: Vec<RgbaImage>,
    // Scalar metrics of the comparison.
    stat: AudioDiffStat,
}
119
120impl AudioDiffDetail {
121 fn spectrogram_diff(&self) -> &[RgbaImage] {
122 &self.spectrogram_diff
123 }
124
125 fn stat(&self) -> &AudioDiffStat {
126 &self.stat
127 }
128}
129
/// Scalar metrics describing how two audio streams differ.
#[derive(Debug, Clone, Copy)]
pub struct AudioDiffStat {
    /// Fraction of compared spectrogram cells whose difference exceeded the
    /// spectral tolerance, averaged over channels.
    pub spectrogram_diff_rate: f64,
    /// Time shift, in samples, that was applied to align the two streams.
    pub shift_samples: i32,
    /// Largest per-channel absolute loudness difference in dB (RMS based, as
    /// computed by `loudness_db`).
    pub lufs_diff_db: f32,
}
136
/// Decoded properties and rendered images of a single audio file.
#[derive(Debug)]
pub struct AudioData {
    // MIME type the file was classified as.
    mime: Mime,
    // Sample rate reported by the decoder, in Hz.
    sample_rate: u32,
    // Channel count reported by the decoder.
    channels: u16,
    // Length of the longest decoded channel divided by the sample rate.
    duration_seconds: f32,
    // One waveform image per decoded channel.
    waveform: Vec<RgbaImage>,
    // One spectrogram image per decoded channel.
    spectrogram: Vec<RgbaImage>,
    // Memory-mapped raw bytes of the original file.
    content: Arc<Mmap>,
}
147
148impl AudioData {
149 fn mime(&self) -> &Mime {
150 &self.mime
151 }
152
153 fn sample_rate(&self) -> u32 {
154 self.sample_rate
155 }
156
157 fn channels(&self) -> u16 {
158 self.channels
159 }
160
161 fn duration_seconds(&self) -> f32 {
162 self.duration_seconds
163 }
164
165 fn waveform(&self) -> &[RgbaImage] {
166 &self.waveform
167 }
168
169 fn spectrogram(&self) -> &[RgbaImage] {
170 &self.spectrogram
171 }
172
173 fn content(&self) -> &[u8] {
174 &self.content
175 }
176}
177
/// Errors that can occur while decoding an audio file.
#[derive(Debug, Error)]
pub enum AudioDecodeError {
    /// Any error reported by symphonia while probing, demuxing or decoding.
    #[error("symphonia error: {0}")]
    Symphonia(#[from] SymphoniaError),
    /// The container exposes no default track.
    #[error("no default audio track")]
    NoDefaultTrack,
    /// Neither the codec parameters nor any decoded packet provided a signal
    /// specification (sample rate and channels).
    #[error("missing sample rate")]
    MissingSampleRate,
}
187
/// Compares two audio files using time alignment, loudness and spectrograms.
///
/// The derived `Default` sets every tolerance to zero.
#[derive(Default)]
pub struct AudioDiffCalculator {
    // Maximum time shift, in seconds, tried when aligning the two streams.
    shift_tolerance_seconds: f32,
    // Maximum accepted per-channel loudness difference in dB.
    lufs_tolerance_db: f32,
    // Maximum accepted difference between two spectrogram cells
    // (cells hold log10 of the bin power).
    spectral_tolerance: f32,
    // Maximum accepted fraction of spectrogram cells exceeding `spectral_tolerance`.
    spectrogram_diff_rate_tolerance: f64,
    // Decoder + FFT state shared by all comparisons.
    spectrogram_analyzer: SpectrogramAnalyzer,
}
196
197impl Debug for AudioDiffCalculator {
198 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
199 f.debug_struct("AudioDiffCalculator")
200 .field("shift_tolerance_seconds", &self.shift_tolerance_seconds)
201 .field("lufs_tolerance_db", &self.lufs_tolerance_db)
202 .field("spectral_tolerance", &self.spectral_tolerance)
203 .field("spectrogram_diff_rate_tolerance", &self.spectrogram_diff_rate_tolerance)
204 .finish()
205 }
206}
207
208impl AudioDiffCalculator {
209 fn diff_decoded(&self, expected: &AudioDecoded, actual: &AudioDecoded) -> AudioDiffStatus {
210 if (expected.sample_rate, expected.channels) != (actual.sample_rate, actual.channels) {
211 return AudioDiffStatus::Incomparable;
212 }
213
214 let sample_rate = expected.sample_rate;
215 let max_shift_samples = (self.shift_tolerance_seconds * sample_rate as f32).round() as i32;
216 let (aligned_expected, aligned_actual, shift_samples) =
217 align_samples(expected.samples.clone(), actual.samples.clone(), max_shift_samples);
218
219 let expected_spectrogram = aligned_expected
220 .iter()
221 .map(|channel| self.spectrogram_analyzer.compute(channel))
222 .collect::<Vec<_>>();
223 let actual_spectrogram = aligned_actual
224 .iter()
225 .map(|channel| self.spectrogram_analyzer.compute(channel))
226 .collect::<Vec<_>>();
227
228 let (spectrogram_diff, spectrogram_diff_rate) =
229 self.build_diff_images(&expected_spectrogram, &actual_spectrogram);
230
231 let lufs_diff_db = summarize_channel_metrics(&aligned_expected, &aligned_actual);
232
233 let detail = AudioDiffDetail {
234 spectrogram_diff,
235 stat: AudioDiffStat {
236 spectrogram_diff_rate,
237 shift_samples,
238 lufs_diff_db,
239 },
240 };
241
242 let equal =
243 lufs_diff_db <= self.lufs_tolerance_db && spectrogram_diff_rate <= self.spectrogram_diff_rate_tolerance;
244 if equal {
245 AudioDiffStatus::Equal(detail)
246 } else {
247 AudioDiffStatus::Different(detail)
248 }
249 }
250}
251
252impl DiffCalculator<FileLeaf> for AudioDiffCalculator {
253 type Error = convert::Infallible;
254 type Diff = AudioDiff;
255
256 fn diff(
257 &self,
258 _name: &str,
259 expected: FileLeaf,
260 actual: FileLeaf,
261 ) -> Result<MayUnsupported<Self::Diff>, Self::Error> {
262 if !is_audio_kind(&expected.kind) || !is_audio_kind(&actual.kind) {
263 return Ok(MayUnsupported::Unsupported);
264 }
265 let Ok(expected_decoded) = self
266 .spectrogram_analyzer
267 .decode_audio(&expected.kind, expected.content.as_ref())
268 else {
269 return Ok(MayUnsupported::Unsupported);
270 };
271 let Ok(actual_decoded) = self
272 .spectrogram_analyzer
273 .decode_audio(&actual.kind, actual.content.as_ref())
274 else {
275 return Ok(MayUnsupported::Unsupported);
276 };
277 let stat_decoded = AudioStat::from_pair(&expected_decoded, &actual_decoded);
278 let expected_data =
279 build_audio_data_from_decoded(expected.kind, expected.content, &expected_decoded, &stat_decoded);
280 let actual_data = build_audio_data_from_decoded(actual.kind, actual.content, &actual_decoded, &stat_decoded);
281 let status = self.diff_decoded(&expected_decoded, &actual_decoded);
282
283 Ok(MayUnsupported::Ok(AudioDiff {
284 status,
285 expected: expected_data,
286 actual: actual_data,
287 }))
288 }
289}
290
/// Value ranges used to scale waveform and spectrogram renderings.
#[derive(Debug)]
struct AudioStat {
    // Largest absolute sample value.
    signal_max: f32,
    // Smallest spectrogram cell value.
    spectrogram_min: f32,
    // Largest spectrogram cell value.
    spectrogram_max: f32,
    // Duration in seconds that the full image width represents.
    duration: f32,
}
298
299impl AudioStat {
300 fn from_one(decoded: &AudioDecoded) -> AudioStat {
301 let signal_max = decoded
302 .samples
303 .iter()
304 .flatten()
305 .copied()
306 .map(f32::abs)
307 .fold(0.0, f32::max);
308 let (spectrogram_min, spectrogram_max) = decoded
309 .spectrograms
310 .iter()
311 .flatten()
312 .flatten()
313 .copied()
314 .fold((f32::INFINITY, f32::NEG_INFINITY), |(min, max), v| {
315 (v.min(min), v.max(max))
316 });
317 let duration = decoded.duration_seconds;
318 AudioStat {
319 signal_max,
320 spectrogram_min,
321 spectrogram_max,
322 duration,
323 }
324 }
325
326 fn from_pair(expected: &AudioDecoded, actual: &AudioDecoded) -> AudioStat {
327 let signal_max = expected
328 .samples
329 .iter()
330 .chain(actual.samples.iter())
331 .flatten()
332 .copied()
333 .map(f32::abs)
334 .fold(0.0, f32::max);
335 let (spectrogram_min, spectrogram_max) = expected
336 .spectrograms
337 .iter()
338 .chain(actual.spectrograms.iter())
339 .flatten()
340 .flatten()
341 .copied()
342 .filter(|v| v.is_finite())
343 .fold((f32::INFINITY, f32::NEG_INFINITY), |(min, max), v| {
344 (v.min(min), v.max(max))
345 });
346 let duration = expected.duration_seconds.max(actual.duration_seconds);
347 AudioStat {
348 signal_max,
349 spectrogram_min,
350 spectrogram_max,
351 duration,
352 }
353 }
354}
355
356impl AudioDiffCalculator {
357 pub fn new(
358 shift_tolerance_seconds: f32,
359 lufs_tolerance_db: f32,
360 spectral_tolerance: f32,
361 spectrogram_diff_rate_tolerance: f64,
362 ) -> Self {
363 Self {
364 shift_tolerance_seconds,
365 lufs_tolerance_db,
366 spectral_tolerance,
367 spectrogram_diff_rate_tolerance,
368 spectrogram_analyzer: SpectrogramAnalyzer::new(),
369 }
370 }
371
372 fn build_diff_images(
373 &self,
374 expected: &[Vec<[f32; SPECTROGRAM_DATA_HEIGHT]>],
375 actual: &[Vec<[f32; SPECTROGRAM_DATA_HEIGHT]>],
376 ) -> (Vec<RgbaImage>, f64) {
377 assert_eq!(expected.len(), actual.len());
378 let mut diff_images = Vec::with_capacity(expected.len());
379 let mut diff_rate_sum = 0.0;
380 for (expected_frame, actual_frame) in expected.iter().zip(actual.iter()) {
381 let (diff_image, diff_rate) = self.diff_spectrograms(expected_frame, actual_frame);
382 diff_images.push(diff_image);
383 diff_rate_sum += diff_rate;
384 }
385 (diff_images, diff_rate_sum / expected.len() as f64)
386 }
387
388 fn diff_spectrograms(
389 &self,
390 expected: &[[f32; SPECTROGRAM_DATA_HEIGHT]],
391 actual: &[[f32; SPECTROGRAM_DATA_HEIGHT]],
392 ) -> (RgbaImage, f64) {
393 let spectrogram_len = expected.len().max(actual.len());
394 let mut diff_image = RgbaImage::from_pixel(SPECTROGRAM_WIDTH, SPECTROGRAM_HEIGHT, Rgba([255, 255, 255, 0]));
395 let mut diff_count = 0usize;
396 let mut total_count = 0usize;
397 assert!(SPECTROGRAM_DATA_HEIGHT >= SPECTROGRAM_HEIGHT as usize);
398 if spectrogram_len >= SPECTROGRAM_WIDTH as usize {
399 for x in 0..SPECTROGRAM_WIDTH {
400 let x_range = x as usize * spectrogram_len / SPECTROGRAM_WIDTH as usize
401 ..(x + 1) as usize * spectrogram_len / SPECTROGRAM_WIDTH as usize;
402 for y in 0..SPECTROGRAM_HEIGHT {
403 let y_range = spectrogram_log_bin_range(y);
404
405 let mut diff_sum = 0usize;
406 for y in y_range.clone() {
407 for x in x_range.clone() {
408 let expected = expected.get(x).map(|x| x[y]);
409 let actual = actual.get(x).map(|x| x[y]);
410 let diff = (expected.unwrap_or(f32::INFINITY) - actual.unwrap_or(f32::NEG_INFINITY)).abs();
411 total_count += 1;
412 if diff > self.spectral_tolerance {
413 diff_sum += 1;
414 diff_count += 1;
415 }
416 }
417 }
418 diff_image.put_pixel(
419 x,
420 SPECTROGRAM_HEIGHT - y - 1,
421 Rgba([
422 255,
423 255,
424 255,
425 (diff_sum as f64 / (x_range.len() * y_range.len()) as f64 * 255.0) as u8,
426 ]),
427 );
428 }
429 }
430 } else {
431 for x in 0..spectrogram_len {
432 let image_x_range = x as u32 * SPECTROGRAM_WIDTH / spectrogram_len as u32
433 ..(x + 1) as u32 * SPECTROGRAM_WIDTH / spectrogram_len as u32;
434 for y in 0..SPECTROGRAM_HEIGHT {
435 let y_range = spectrogram_log_bin_range(y);
436 let mut diff_sum = 0usize;
437 for y in y_range.clone() {
438 let expected = expected.get(x).map(|x| x[y]);
439 let actual = actual.get(x).map(|x| x[y]);
440 let diff = (expected.unwrap_or(f32::INFINITY) - actual.unwrap_or(f32::NEG_INFINITY)).abs();
441 total_count += 1;
442 if diff > self.spectral_tolerance {
443 diff_sum += 1;
444 diff_count += 1;
445 }
446 }
447 let color = Rgba([255, 255, 255, (diff_sum as f64 / y_range.len() as f64 * 255.0) as u8]);
448 for x in image_x_range.clone() {
449 diff_image.put_pixel(x, SPECTROGRAM_HEIGHT - y - 1, color);
450 }
451 }
452 }
453 }
454 let diff_rate = if total_count == 0 {
455 0.0
456 } else {
457 diff_count as f64 / total_count as f64
458 };
459 (diff_image, diff_rate)
460 }
461}
462
463pub fn audio_extension(kind: &Mime) -> Option<&'static str> {
464 match kind.essence_str() {
465 "audio/mpeg" => Some("mp3"),
466 "audio/wav" | "audio/x-wav" => Some("wav"),
467 "audio/flac" => Some("flac"),
468 "audio/ogg" | "application/ogg" => Some("ogg"),
469 "audio/opus" => Some("opus"),
470 "audio/webm" => Some("webm"),
471 "audio/aac" => Some("aac"),
472 "audio/mp4" | "video/mp4" => Some("m4a"),
473 "audio/x-m4a" => Some("m4a"),
474 _ => mime_guess::get_mime_extensions(kind).and_then(|exts| exts.first().copied()),
475 }
476}
477
478fn is_audio_kind(kind: &Mime) -> bool {
479 kind.type_() == mime::AUDIO || kind.type_() == mime::VIDEO
480}
481
482fn build_audio_data_from_decoded(
483 mime: Mime,
484 content: Arc<Mmap>,
485 decoded: &AudioDecoded,
486 stat: &AudioStat,
487) -> AudioData {
488 let waveform = render_waveforms(&decoded.samples, stat, decoded.sample_rate);
489 let spectrogram = render_spectrograms(&decoded.spectrograms, stat, decoded.sample_rate);
490 AudioData {
491 mime,
492 sample_rate: decoded.sample_rate,
493 channels: decoded.channels,
494 duration_seconds: decoded.duration_seconds,
495 waveform,
496 spectrogram,
497 content,
498 }
499}
500
/// Fully decoded audio stream together with its per-channel spectrograms.
struct AudioDecoded {
    // Sample rate in Hz taken from the signal specification.
    sample_rate: u32,
    // Channel count taken from the signal specification.
    channels: u16,
    // Length of the longest channel divided by the sample rate.
    duration_seconds: f32,
    // Planar samples: one vector per decoded plane.
    samples: Vec<Vec<f32>>,
    // One spectrogram (sequence of frames) per entry of `samples`.
    spectrograms: Vec<Vec<[f32; SPECTROGRAM_DATA_HEIGHT]>>,
}
508
509fn align_samples(
510 mut expected: Vec<Vec<f32>>,
511 mut actual: Vec<Vec<f32>>,
512 max_shift_samples: i32,
513) -> (Vec<Vec<f32>>, Vec<Vec<f32>>, i32) {
514 assert_eq!(expected.len(), actual.len());
515 let best_shift = (-max_shift_samples..=max_shift_samples)
516 .map(|shift| {
517 let score_sum = expected
518 .iter()
519 .zip(actual.iter())
520 .map(|(expected_channel, actual_channel)| {
521 let (expected_slice, actual_slice) = overlap_slices(expected_channel, actual_channel, shift);
522 normalized_correlation(expected_slice, actual_slice)
523 })
524 .sum::<f32>();
525 (shift, score_sum)
526 })
527 .min_by(|&(_, score1), &(_, score2)| score1.partial_cmp(&score2).unwrap())
528 .map_or(0, |(shift, _)| shift);
529
530 for (expected, actual) in expected.iter_mut().zip(actual.iter_mut()) {
531 let (expected_range, actual_range) = overlap_range(expected.len(), actual.len(), best_shift);
532 expected.drain(..expected_range.start.min(expected.len()));
533 actual.drain(..actual_range.start.min(actual.len()));
534 }
535
536 (expected, actual, best_shift)
537}
538
539fn summarize_channel_metrics(expected: &[Vec<f32>], actual: &[Vec<f32>]) -> f32 {
540 let channel_count = expected.len().min(actual.len());
541 if channel_count == 0 {
542 return f32::INFINITY;
543 }
544 let mut max_lufs_diff = 0.0f32;
545 for channel_index in 0..channel_count {
546 let expected_channel = &expected[channel_index];
547 let actual_channel = &actual[channel_index];
548 if expected_channel.is_empty() || actual_channel.is_empty() {
549 continue;
550 }
551 let expected_lufs = loudness_db(expected_channel);
552 let actual_lufs = loudness_db(actual_channel);
553 max_lufs_diff = max_lufs_diff.max((expected_lufs - actual_lufs).abs());
554 }
555 max_lufs_diff
556}
557
558fn render_waveforms(samples: &[Vec<f32>], stat: &AudioStat, sample_rate: u32) -> Vec<RgbaImage> {
559 samples
560 .iter()
561 .map(|channel| render_waveform(channel, stat, sample_rate))
562 .collect()
563}
564
/// Computes the index ranges of the overlapping parts of two signals when
/// `actual` is shifted by `shift` samples relative to `expected`.
///
/// For `shift >= 0` the overlap starts at `expected[0]` / `actual[shift]`;
/// for `shift < 0` it starts at `expected[-shift]` / `actual[0]`. Both
/// returned ranges have the same length.
///
/// The returned ranges are always valid for slicing signals of the given
/// lengths: a shift larger than the shifted signal yields empty ranges
/// positioned at that signal's end.
fn overlap_range(expected_len: usize, actual_len: usize, shift: i32) -> (Range<usize>, Range<usize>) {
    // `unsigned_abs` is well defined even for `i32::MIN`.
    let offset = usize::try_from(shift.unsigned_abs()).unwrap_or(usize::MAX);
    if shift >= 0 {
        let start = offset.min(actual_len);
        let len = expected_len.min(actual_len - start);
        (0..len, start..start + len)
    } else {
        let start = offset.min(expected_len);
        let len = actual_len.min(expected_len - start);
        (start..start + len, 0..len)
    }
}
576
577fn overlap_slices<'a>(expected: &'a [f32], actual: &'a [f32], shift: i32) -> (&'a [f32], &'a [f32]) {
578 let (expected_range, actual_range) = overlap_range(expected.len(), actual.len(), shift);
579 (&expected[expected_range], &actual[actual_range])
580}
581
582fn normalized_correlation(expected: &[f32], actual: &[f32]) -> f32 {
583 assert_eq!(expected.len(), actual.len());
584 let mut dot = 0.0f32;
585 let mut expected_power = 0.0f32;
586 let mut actual_power = 0.0f32;
587 for (&e, &a) in expected.iter().zip(actual.iter()) {
588 dot += e * a;
589 expected_power += e * e;
590 actual_power += a * a;
591 }
592 let denom = (expected_power.sqrt() * actual_power.sqrt()).max(LOG_EPSILON);
593 dot / denom
594}
595
596fn loudness_db(samples: &[f32]) -> f32 {
597 if samples.is_empty() {
598 return -100.0;
599 }
600 let power = samples.iter().map(|sample| sample * sample).sum::<f32>() / samples.len() as f32;
601 let rms = power.sqrt();
602 20.0 * rms.max(LOG_EPSILON).log10()
603}
604
605fn render_waveform(samples: &[f32], stat: &AudioStat, sample_rate: u32) -> RgbaImage {
606 const WAVEFORM_COLOR: Rgba<u8> = Rgba([255, 255, 255, 255]);
607 let clip = (stat.signal_max * 1.2).clamp(LOG_EPSILON, 1.0);
608 let mut image = RgbaImage::from_pixel(WAVEFORM_WIDTH, WAVEFORM_HEIGHT, Rgba([255, 255, 255, 0]));
609 if stat.duration <= 0.0 || sample_rate == 0 {
610 return image;
611 }
612 let to_y = |value: f32| {
613 let normalized = (value + clip) / (2.0 * clip);
614 ((normalized * WAVEFORM_HEIGHT as f32).round() as u32).min(WAVEFORM_HEIGHT - 1)
615 };
616 let duration = stat.duration;
617 for x in 0..WAVEFORM_WIDTH {
618 let start_time = x as f32 * duration / WAVEFORM_WIDTH as f32;
619 let end_time = (x + 1) as f32 * duration / WAVEFORM_WIDTH as f32;
620 let start = (start_time * sample_rate as f32).floor() as usize;
621 let end = (end_time * sample_rate as f32).ceil() as usize;
622 let start = start.min(samples.len());
623 let end = end.min(samples.len());
624 if end <= start {
625 continue;
626 }
627 let (min, max) = samples[start..end]
628 .iter()
629 .fold((1.0f32, -1.0f32), |(min, max), &s| (min.min(s), max.max(s)));
630 let y_min = to_y(min);
631 let y_max = to_y(max);
632 for y in y_min..=y_max {
633 image.put_pixel(x, WAVEFORM_HEIGHT - y - 1, WAVEFORM_COLOR);
634 }
635 }
636 image
637}
638
639fn render_spectrograms(
640 spectrograms: &[Vec<[f32; SPECTROGRAM_DATA_HEIGHT]>],
641 stat: &AudioStat,
642 sample_rate: u32,
643) -> Vec<RgbaImage> {
644 spectrograms
645 .iter()
646 .map(|channel| render_spectrogram(channel, stat, sample_rate))
647 .collect()
648}
649
/// Returns the range of spectrogram data bins that pixel row `y` (counted
/// from the low-frequency end) represents on a logarithmic frequency axis.
///
/// The table is built once on first use. Pixel position `p` and data bin `b`
/// are related by `b = A * (B^(p / SPECTROGRAM_HEIGHT) - 1)` with `B = 20`
/// and `A = SPECTROGRAM_DATA_HEIGHT / (B - 1)`, so `p = 0` maps to bin 0 and
/// `p = SPECTROGRAM_HEIGHT` maps to `SPECTROGRAM_DATA_HEIGHT`.
///
/// # Panics
/// Panics if `y >= SPECTROGRAM_HEIGHT`.
fn spectrogram_log_bin_range(y: u32) -> Range<usize> {
    static RANGES: LazyLock<[Range<usize>; SPECTROGRAM_HEIGHT as usize]> = LazyLock::new(|| {
        const B: f64 = 20.0;
        const A: f64 = SPECTROGRAM_DATA_HEIGHT as f64 / (B - 1.0);
        let mut ranges = [const { 0usize..0 }; SPECTROGRAM_HEIGHT as usize];
        let mut wrote = 0;
        // Phase 1: low bins, where a single data bin covers at least one
        // pixel row. Every row covered by bin `y` is assigned `y..y + 1`.
        for y in 0..SPECTROGRAM_DATA_HEIGHT {
            // Pixel positions (as fractions of the height) of the bin's edges.
            let p1 = f64::log(1.0 / A * y as f64 + 1.0, B);
            let p2 = f64::log(1.0 / A * (y + 1) as f64 + 1.0, B);
            let range = p1 * SPECTROGRAM_HEIGHT as f64..p2 * SPECTROGRAM_HEIGHT as f64;
            // Stop once a bin is narrower than one pixel row.
            if range.end - range.start < 1.0 {
                break;
            }
            ranges[range.start.round() as usize..range.end.round() as usize].fill(y..y + 1);
            wrote = y + 1;
        }
        // Phase 2: remaining rows, each covering the bins between the rounded
        // inverse-mapped positions of its lower and upper edge.
        // NOTE(review): `wrote` counts data bins handled in phase 1 but is used
        // here to skip pixel rows, so rows from `wrote` up to the last row
        // filled in phase 1 are overwritten — confirm this is intended.
        for (y, slot) in ranges
            .iter_mut()
            .enumerate()
            .take(SPECTROGRAM_HEIGHT as usize)
            .skip(wrote)
        {
            let range = (A * (f64::powf(B, y as f64 / SPECTROGRAM_HEIGHT as f64) - 1.0)).round() as usize
                ..(A * (f64::powf(B, (y + 1) as f64 / SPECTROGRAM_HEIGHT as f64) - 1.0)).round() as usize;
            *slot = range;
        }
        ranges
    });
    RANGES[y as usize].clone()
}
680
/// Decodes audio and computes spectrograms with a pre-planned FFT.
struct SpectrogramAnalyzer {
    // Forward FFT of length `FFT_WINDOW_SIZE`.
    fft: Arc<dyn Fft<f32>>,
    // Window coefficients applied to each frame before the FFT
    // (`FFT_WINDOW_SIZE` entries).
    window: Box<[f32]>,
}
685
686impl Default for SpectrogramAnalyzer {
687 fn default() -> Self {
688 Self::new()
689 }
690}
691
impl SpectrogramAnalyzer {
    /// Plans a forward FFT of `FFT_WINDOW_SIZE` points and precomputes the
    /// sine window `sin(pi * i / (FFT_WINDOW_SIZE - 1))`.
    fn new() -> SpectrogramAnalyzer {
        let fft = FftPlanner::<f32>::new().plan_fft_forward(FFT_WINDOW_SIZE);
        let window = (0..FFT_WINDOW_SIZE)
            .map(|i| (PI * i as f32 / (FFT_WINDOW_SIZE - 1) as f32).sin())
            .collect();
        SpectrogramAnalyzer { fft, window }
    }

    /// Decodes the default track of `content` into planar `f32` samples and
    /// computes one spectrogram per plane.
    ///
    /// `mime` is only used to give the format probe an extension hint.
    ///
    /// # Errors
    /// * `AudioDecodeError::Symphonia` for any probe, demux or decode error
    ///   (a single undecodable packet aborts the whole decode).
    /// * `AudioDecodeError::NoDefaultTrack` if the container has no default track.
    /// * `AudioDecodeError::MissingSampleRate` if no signal specification
    ///   could be determined.
    fn decode_audio(&self, mime: &Mime, content: &[u8]) -> Result<AudioDecoded, AudioDecodeError> {
        let mut hint = Hint::new();
        if let Some(extension) = audio_extension(mime) {
            hint.with_extension(extension);
        }

        // The media source must own its data, so the input is copied.
        let owned = content.to_vec();
        let mss = MediaSourceStream::new(Box::new(Cursor::new(owned)), Default::default());
        let probed = symphonia::default::get_probe().format(
            &hint,
            mss,
            &FormatOptions::default(),
            &MetadataOptions::default(),
        )?;
        let mut format = probed.format;
        let track = format.default_track().ok_or(AudioDecodeError::NoDefaultTrack)?;
        let track_id = track.id;
        let codec_params = track.codec_params.clone();
        let mut decoder = symphonia::default::get_codecs().make(&codec_params, &DecoderOptions::default())?;

        // Starts with a single empty plane; resized to the real plane count
        // once the first packet has been decoded.
        let mut samples = vec![Vec::new()];
        // Prefer the container's declared spec; otherwise it is taken from the
        // first decoded packet below.
        let mut signal_spec = if let Some(rate) = codec_params.sample_rate
            && let Some(channels) = codec_params.channels
        {
            Some(SignalSpec { rate, channels })
        } else {
            None
        };
        // Conversion buffer, allocated lazily from the first decoded packet.
        let mut sample_buf = None::<AudioBuffer<f32>>;
        loop {
            let packet = match format.next_packet() {
                Ok(packet) => packet,
                Err(SymphoniaError::ResetRequired) => {
                    decoder.reset();
                    continue;
                }
                // End of stream is signalled as an unexpected-EOF I/O error.
                Err(SymphoniaError::IoError(e)) if e.kind() == ErrorKind::UnexpectedEof => break,
                Err(err) => return Err(err.into()),
            };
            // Ignore packets that belong to other tracks.
            if packet.track_id() != track_id {
                continue;
            }
            let decoded = decoder.decode(&packet)?;
            let spec = *decoded.spec();
            if signal_spec.is_none() {
                signal_spec = Some(spec);
            }
            // NOTE(review): the buffer is sized from the first packet only;
            // presumably later packets never need more capacity or a
            // different spec — confirm against symphonia's `convert` contract.
            let sample_buf = sample_buf.get_or_insert_with(|| AudioBuffer::<f32>::new(decoded.capacity() as u64, spec));
            decoded.convert(sample_buf);
            samples.resize_with(sample_buf.planes().planes().len(), Vec::new);
            for (plane, samples) in sample_buf.planes().planes().iter().zip(samples.iter_mut()) {
                samples.extend_from_slice(plane);
            }
        }

        let Some(signal_spec) = signal_spec else {
            return Err(AudioDecodeError::MissingSampleRate);
        };

        // Duration is derived from the longest plane.
        let max_len = samples.iter().map(|channel| channel.len()).max().unwrap_or(0);
        let duration_seconds = max_len as f32 / signal_spec.rate as f32;

        let spectrograms = samples.iter().map(|sample| self.compute(sample)).collect::<Vec<_>>();

        Ok(AudioDecoded {
            sample_rate: signal_spec.rate,
            channels: signal_spec.channels.count() as u16,
            duration_seconds,
            samples,
            spectrograms,
        })
    }

    /// Computes the spectrogram of one channel.
    ///
    /// Frames start every `FFT_WINDOW_SIZE / 2` samples, are zero-padded to
    /// `FFT_WINDOW_SIZE`, multiplied by the window and transformed. Each
    /// output frame holds `log10` of the squared magnitude (floored at
    /// `1e-32`) of the first `SPECTROGRAM_DATA_HEIGHT` FFT bins.
    ///
    /// NOTE(review): a frame is produced for every start offset up to and
    /// including `samples.len()`, so an input whose length is a multiple of
    /// the hop (including empty input) ends with a frame made of padding
    /// only, and the reserved capacity is one frame short — confirm whether
    /// the trailing frame is intended.
    fn compute(&self, samples: &[f32]) -> Vec<[f32; SPECTROGRAM_DATA_HEIGHT]> {
        // Heap-allocated FFT working buffer, reused for every frame.
        let mut buffer =
            Box::<[Complex<f32>; FFT_WINDOW_SIZE]>::try_from(vec![Complex::zero(); FFT_WINDOW_SIZE]).unwrap();
        let mut scratch = vec![Complex::zero(); self.fft.get_inplace_scratch_len()];
        let mut result = Vec::with_capacity(samples.len() / (FFT_WINDOW_SIZE / 2));
        for i in 0.. {
            // `get` yields `None` once the start offset lies past the end.
            let Some(samples) = samples.get(i * (FFT_WINDOW_SIZE / 2)..) else {
                break;
            };
            // Fill the buffer with windowed samples, padding with zeros.
            buffer
                .iter_mut()
                .zip(
                    samples
                        .iter()
                        .copied()
                        .chain(iter::repeat(0.0))
                        .zip(self.window.iter().copied()),
                )
                .for_each(|(slot, (s, w))| *slot = Complex::from(s * w));
            self.fft.process_with_scratch(&mut *buffer, &mut scratch);
            result.push([0.0; SPECTROGRAM_DATA_HEIGHT]);
            // The zip stops after `SPECTROGRAM_DATA_HEIGHT` bins, i.e. the
            // first half of the FFT output.
            result
                .last_mut()
                .unwrap()
                .iter_mut()
                .zip(buffer.iter().copied())
                .for_each(|(slot, b)| *slot = b.norm_sqr().max(1e-32).log10());
        }
        result
    }
}
805
806fn render_spectrogram(spectrogram: &[[f32; SPECTROGRAM_DATA_HEIGHT]], stat: &AudioStat, sample_rate: u32) -> RgbaImage {
807 let mut image = RgbaImage::from_pixel(SPECTROGRAM_WIDTH, SPECTROGRAM_HEIGHT, Rgba([255, 255, 255, 0]));
808 if spectrogram.is_empty() || stat.duration <= 0.0 || sample_rate == 0 {
809 return image;
810 }
811
812 let value_range = (stat.spectrogram_max - stat.spectrogram_min).max(LOG_EPSILON);
813 let map_value = |v: f32| (v - stat.spectrogram_min) / value_range;
814 assert!(SPECTROGRAM_HEIGHT <= SPECTROGRAM_DATA_HEIGHT as u32);
815 let duration = stat.duration;
816 let hop_samples = (FFT_WINDOW_SIZE / 2) as f32;
817 let frame_duration = hop_samples / sample_rate as f32;
818 if frame_duration <= 0.0 {
819 return image;
820 }
821
822 for x in 0..SPECTROGRAM_WIDTH {
823 let start_time = x as f32 * duration / SPECTROGRAM_WIDTH as f32;
824 let end_time = (x + 1) as f32 * duration / SPECTROGRAM_WIDTH as f32;
825 let start = (start_time / frame_duration).floor() as usize;
826 let end = (end_time / frame_duration).ceil() as usize;
827 let start = start.min(spectrogram.len());
828 let end = end.min(spectrogram.len());
829 if end <= start {
830 continue;
831 }
832
833 for y in 0..SPECTROGRAM_HEIGHT {
834 let y_range = spectrogram_log_bin_range(y);
835 let sum = spectrogram[start..end]
836 .iter()
837 .flat_map(|spec| spec[y_range.clone()].iter().copied())
838 .sum::<f32>();
839 let value = sum / ((end - start) * y_range.len()) as f32;
840 let intensity = map_value(value);
841 image.put_pixel(
842 x,
843 SPECTROGRAM_HEIGHT - y - 1,
844 Rgba([255, 255, 255, (intensity * 255.0) as u8]),
845 );
846 }
847 }
848 image
849}