1#![doc = include_str!("../README.md")]
2
3mod clip;
4pub mod surface;
5pub use clip::{AudioClip, ConcatPolicy, FadeCurve, MixPolicy};
6pub use math_signal_core::{
8 BiquadCoefficients, BiquadDesign, FirKernel1d, FrameStride, InterpolationMode, ResampleRatio,
9 ResampleSpec, SampleRate, WindowFunction, WindowSpec,
10};
11use std::collections::{BTreeMap, BTreeSet};
12
13use tensor_data::{F32Tensor, F32TensorView};
14use video_analysis_core::{AudioBuffer, AudioFrame, DetectError, Result, Timebase, Timestamp};
15
16#[derive(Debug, Clone, Copy, PartialEq, Eq)]
17pub struct AudioFormatSpec {
19 pub sample_rate: u32,
21 pub channels: u16,
23 pub frame_samples: Option<usize>,
25}
26
27impl AudioFormatSpec {
28 pub fn new(sample_rate: u32, channels: u16) -> Result<Self> {
30 let spec = Self {
31 sample_rate,
32 channels,
33 frame_samples: None,
34 };
35 spec.validate()?;
36 Ok(spec)
37 }
38
39 pub fn frame_samples(mut self, frame_samples: usize) -> Result<Self> {
41 self.frame_samples = Some(frame_samples);
42 self.validate()?;
43 Ok(self)
44 }
45
46 pub fn validate(&self) -> Result<()> {
48 if self.sample_rate == 0 || self.channels == 0 {
49 return Err(DetectError::InvalidAudioFormat {
50 sample_rate: self.sample_rate,
51 channels: self.channels,
52 });
53 }
54 if self.frame_samples == Some(0) {
55 return Err(DetectError::InvalidArgument(
56 "frame_samples must be greater than zero".to_string(),
57 ));
58 }
59 Ok(())
60 }
61
62 pub fn duration_seconds(&self, samples_per_channel: usize) -> Result<f64> {
64 self.validate()?;
65 Ok(samples_per_channel as f64 / self.sample_rate as f64)
66 }
67}
68
/// Strategy used by `interleaved_to_mono` to collapse interleaved
/// multi-channel samples into a single channel.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChannelMix {
    /// Average all channels of each interleaved sample frame.
    Average,
    /// Keep only the first channel of each interleaved sample frame.
    First,
}
77
/// Single-channel samples together with the timing of the frame they came from.
#[derive(Debug, Clone, PartialEq)]
pub struct MonoSamples {
    /// Timestamp copied from the source audio frame.
    pub timestamp: Timestamp,
    /// Samples per second of `samples`.
    pub sample_rate: u32,
    /// Mono samples produced by `interleaved_to_mono`.
    pub samples: Vec<f32>,
}
88
89#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
90pub struct AudioFeaturePoint {
92 pub start_seconds: f32,
94 pub end_seconds: f32,
96 pub values: BTreeMap<String, f32>,
98}
99
100impl AudioFeaturePoint {
101 pub fn new(
103 start_seconds: f32,
104 end_seconds: f32,
105 values: BTreeMap<String, f32>,
106 ) -> Result<Self> {
107 let point = Self {
108 start_seconds,
109 end_seconds,
110 values,
111 };
112 point.validate()?;
113 Ok(point)
114 }
115
116 pub fn validate(&self) -> Result<()> {
118 validate_time_range(self.start_seconds, self.end_seconds, "audio feature point")?;
119 validate_feature_values(&self.values)
120 }
121}
122
123#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
124pub struct AudioFeatureSeries {
126 pub sample_rate: u32,
128 pub channels: u16,
130 pub frame_size: usize,
132 pub hop_size: usize,
134 pub points: Vec<AudioFeaturePoint>,
136}
137
138impl AudioFeatureSeries {
139 pub fn new(
141 sample_rate: u32,
142 channels: u16,
143 frame_size: usize,
144 hop_size: usize,
145 points: Vec<AudioFeaturePoint>,
146 ) -> Result<Self> {
147 let series = Self {
148 sample_rate,
149 channels,
150 frame_size,
151 hop_size,
152 points,
153 };
154 series.validate()?;
155 Ok(series)
156 }
157
158 pub fn validate(&self) -> Result<()> {
160 AudioFormatSpec::new(self.sample_rate, self.channels)?.frame_samples(self.frame_size)?;
161 FrameSpec::new(self.frame_size, self.hop_size)?;
162 let mut previous_start = 0.0_f32;
163 for point in &self.points {
164 point.validate()?;
165 if point.start_seconds < previous_start
166 && !nearly_equal(point.start_seconds, previous_start)
167 {
168 return Err(DetectError::InvalidArgument(
169 "audio feature points must be ordered by start time".to_string(),
170 ));
171 }
172 previous_start = point.start_seconds;
173 }
174 Ok(())
175 }
176
177 pub fn duration_seconds(&self) -> f32 {
179 self.points
180 .last()
181 .map(|point| point.end_seconds)
182 .unwrap_or(0.0)
183 }
184}
185
186#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
187pub struct AudioFeatureSummary {
189 pub sample_rate: u32,
191 pub duration_seconds: f32,
193 pub frame_count: usize,
195 pub metrics: BTreeMap<String, f32>,
197}
198
199impl AudioFeatureSummary {
200 pub fn new(
202 sample_rate: u32,
203 duration_seconds: f32,
204 frame_count: usize,
205 metrics: BTreeMap<String, f32>,
206 ) -> Result<Self> {
207 let summary = Self {
208 sample_rate,
209 duration_seconds,
210 frame_count,
211 metrics,
212 };
213 summary.validate()?;
214 Ok(summary)
215 }
216
217 pub fn validate(&self) -> Result<()> {
219 AudioFormatSpec::new(self.sample_rate, 1)?;
220 if !self.duration_seconds.is_finite() || self.duration_seconds < 0.0 {
221 return Err(DetectError::InvalidArgument(
222 "audio feature summary duration_seconds must be finite and non-negative"
223 .to_string(),
224 ));
225 }
226 validate_feature_values(&self.metrics)
227 }
228}
229
230impl MonoSamples {
231 pub fn duration_seconds(&self) -> f64 {
233 if self.sample_rate == 0 {
234 return 0.0;
235 }
236 self.samples.len() as f64 / self.sample_rate as f64
237 }
238}
239
240#[derive(Debug, Clone, PartialEq)]
241pub struct AudioWaveformBatchView<'a> {
243 pub sample_rate: u32,
245 tensor: F32TensorView<'a>,
246}
247
248impl<'a> AudioWaveformBatchView<'a> {
249 pub fn new(sample_rate: u32, tensor: F32TensorView<'a>) -> Result<Self> {
251 let batch = Self {
252 sample_rate,
253 tensor,
254 };
255 batch.validate()?;
256 Ok(batch)
257 }
258
259 pub fn from_dims(
261 sample_rate: u32,
262 dims: impl Into<Vec<usize>>,
263 values: &'a [f32],
264 ) -> Result<Self> {
265 Self::new(sample_rate, F32TensorView::from_dims(dims, values)?)
266 }
267
268 pub fn tensor(&self) -> &F32TensorView<'a> {
270 &self.tensor
271 }
272
273 pub fn batch_size(&self) -> usize {
275 self.tensor.shape().dimensions()[0]
276 }
277
278 pub fn channel_count(&self) -> usize {
280 self.tensor.shape().dimensions()[1]
281 }
282
283 pub fn time_steps(&self) -> usize {
285 self.tensor.shape().dimensions()[2]
286 }
287
288 pub fn duration_seconds(&self) -> f64 {
290 self.time_steps() as f64 / self.sample_rate as f64
291 }
292
293 pub fn waveform(&self, batch_index: usize, channel_index: usize) -> Result<&'a [f32]> {
295 if batch_index >= self.batch_size() || channel_index >= self.channel_count() {
296 return Err(DetectError::InvalidArgument(format!(
297 "waveform index [{batch_index}, {channel_index}] is out of bounds for [{}, {}]",
298 self.batch_size(),
299 self.channel_count()
300 )));
301 }
302 let time_steps = self.time_steps();
303 let start = batch_index * self.channel_count() * time_steps + channel_index * time_steps;
304 Ok(&self.tensor.values()[start..start + time_steps])
305 }
306
307 fn validate(&self) -> Result<()> {
308 AudioFormatSpec::new(self.sample_rate, 1)?;
309 self.tensor.validate()?;
310 if self.tensor.shape().rank() != 3 {
311 return Err(DetectError::InvalidArgument(
312 "audio waveform batches must use rank 3 [B,C,T] tensors".to_string(),
313 ));
314 }
315 Ok(())
316 }
317}
318
319#[derive(Debug, Clone, PartialEq)]
320pub struct OwnedAudioWaveformBatch {
322 pub sample_rate: u32,
324 tensor: F32Tensor,
325}
326
327impl OwnedAudioWaveformBatch {
328 pub fn new(sample_rate: u32, tensor: F32Tensor) -> Result<Self> {
330 let batch = Self {
331 sample_rate,
332 tensor,
333 };
334 batch.as_view()?;
335 Ok(batch)
336 }
337
338 pub fn from_audio_frames(frames: &[video_analysis_core::OwnedAudioFrame]) -> Result<Self> {
340 if frames.is_empty() {
341 return Err(DetectError::InvalidArgument(
342 "audio waveform batches must contain at least one frame".to_string(),
343 ));
344 }
345 let first = &frames[0];
346 let sample_rate = first.sample_rate;
347 let channels = first.channels as usize;
348 let time_steps = first.samples_per_channel();
349 let mut values = Vec::with_capacity(frames.len() * channels * time_steps);
350
351 for frame in frames {
352 if frame.sample_rate != sample_rate
353 || frame.channels as usize != channels
354 || frame.samples_per_channel() != time_steps
355 {
356 return Err(DetectError::InvalidArgument(
357 "all audio frames in a batch must share sample rate, channel count, and samples per channel"
358 .to_string(),
359 ));
360 }
361 let normalized = normalized_samples(&frame.data);
362 for channel in 0..channels {
363 for time_index in 0..time_steps {
364 values.push(normalized[time_index * channels + channel]);
365 }
366 }
367 }
368
369 Self::new(
370 sample_rate,
371 F32Tensor::from_dims([frames.len(), channels, time_steps], values)?,
372 )
373 }
374
375 pub fn tensor(&self) -> &F32Tensor {
377 &self.tensor
378 }
379
380 pub fn as_view(&self) -> Result<AudioWaveformBatchView<'_>> {
382 AudioWaveformBatchView::new(self.sample_rate, self.tensor.as_view())
383 }
384}
385
386#[derive(Debug, Clone, Copy, PartialEq, Eq)]
387pub struct FrameSpec {
389 pub frame_size: usize,
391 pub hop_size: usize,
393}
394
395impl FrameSpec {
396 pub fn new(frame_size: usize, hop_size: usize) -> Result<Self> {
398 FrameStride::new(frame_size, hop_size)?;
399 Ok(Self {
400 frame_size,
401 hop_size,
402 })
403 }
404
405 pub fn frames<'a>(&self, samples: &'a [f32]) -> AudioFrames<'a> {
407 AudioFrames {
408 samples,
409 spec: *self,
410 offset: 0,
411 }
412 }
413
414 pub fn frame_count(&self, samples_len: usize) -> usize {
416 FrameStride::from(*self).frame_count(samples_len)
417 }
418}
419
420impl From<FrameSpec> for FrameStride {
421 fn from(value: FrameSpec) -> Self {
422 Self {
423 frame_size: value.frame_size,
424 hop_size: value.hop_size,
425 }
426 }
427}
428
429impl TryFrom<FrameStride> for FrameSpec {
430 type Error = DetectError;
431
432 fn try_from(value: FrameStride) -> Result<Self> {
433 Self::new(value.frame_size, value.hop_size)
434 }
435}
436
437#[derive(Debug, Clone)]
438pub struct AudioFrames<'a> {
440 samples: &'a [f32],
441 spec: FrameSpec,
442 offset: usize,
443}
444
445impl<'a> Iterator for AudioFrames<'a> {
446 type Item = (usize, &'a [f32]);
447
448 fn next(&mut self) -> Option<Self::Item> {
449 let end = self.offset.checked_add(self.spec.frame_size)?;
450 if end > self.samples.len() {
451 return None;
452 }
453 let offset = self.offset;
454 self.offset += self.spec.hop_size;
455 Some((offset, &self.samples[offset..end]))
456 }
457}
458
459#[derive(Debug, Clone, Copy, PartialEq, Eq)]
460pub struct StreamingFrameConfig {
462 pub frame_size: usize,
464 pub hop_size: usize,
466 pub channel_mix: ChannelMix,
468 pub max_buffered_samples: usize,
470}
471
472impl StreamingFrameConfig {
473 pub fn new(frame_size: usize, hop_size: usize) -> Result<Self> {
475 FrameSpec::new(frame_size, hop_size)?;
476 Ok(Self {
477 frame_size,
478 hop_size,
479 channel_mix: ChannelMix::Average,
480 max_buffered_samples: frame_size.saturating_add(hop_size).max(frame_size),
481 })
482 }
483
484 pub fn channel_mix(mut self, mix: ChannelMix) -> Self {
486 self.channel_mix = mix;
487 self
488 }
489
490 pub fn max_buffered_samples(mut self, samples: usize) -> Self {
492 self.max_buffered_samples = samples.max(self.frame_size);
493 self
494 }
495}
496
/// One fixed-size window of mono samples emitted by `StreamingFrameBuffer`.
#[derive(Debug, Clone, PartialEq)]
pub struct AudioWindow {
    /// Timestamp of the first sample, derived from `start_sample` and the sample rate.
    pub timestamp: Timestamp,
    /// Samples per second of the source frames.
    pub sample_rate: u32,
    /// Absolute index of the first sample of the window within the stream.
    pub start_sample: u64,
    /// The window's mono samples (`frame_size` of them).
    pub samples: Vec<f32>,
}
509
/// Accumulates mono samples from consecutive audio frames and cuts them into
/// fixed-size, hop-spaced [`AudioWindow`]s.
#[derive(Debug, Clone, PartialEq)]
pub struct StreamingFrameBuffer {
    /// Window geometry, channel mix, and buffer bound.
    config: StreamingFrameConfig,
    /// Sample rate latched from the first pushed frame; `None` before that and after `reset`.
    sample_rate: Option<u32>,
    /// Channel count latched from the first pushed frame; `None` before that and after `reset`.
    channels: Option<u16>,
    /// Mono samples that have not been trimmed yet.
    buffer: Vec<f32>,
    /// Absolute sample index of `buffer[0]`.
    buffered_start_sample: u64,
    /// Absolute sample index at which the next window starts; `None` until the first frame.
    next_window_start_sample: Option<u64>,
}

impl StreamingFrameBuffer {
    /// Creates an empty buffer for `config`.
    ///
    /// # Errors
    /// Propagates any error from `FrameSpec::new`, and returns
    /// `DetectError::InvalidArgument` when `max_buffered_samples` is smaller
    /// than `frame_size`.
    pub fn new(config: StreamingFrameConfig) -> Result<Self> {
        // The config fields are public, so re-check them here rather than
        // trusting that `StreamingFrameConfig::new` produced the value.
        FrameSpec::new(config.frame_size, config.hop_size)?;
        if config.max_buffered_samples < config.frame_size {
            return Err(DetectError::InvalidArgument(
                "max_buffered_samples must be at least frame_size".to_string(),
            ));
        }
        Ok(Self {
            config,
            sample_rate: None,
            channels: None,
            buffer: Vec::new(),
            buffered_start_sample: 0,
            next_window_start_sample: None,
        })
    }

    /// Appends `frame` (mixed down to mono) and returns every window that is
    /// now complete.
    ///
    /// # Errors
    /// Fails when the frame's sample rate or channel count differs from the
    /// latched stream format, when its timestamp cannot be converted to a
    /// sample index, when it starts before the end of the buffered audio,
    /// when the mono mix-down fails, or when the buffer bound is exceeded
    /// after trimming.
    pub fn push_frame(&mut self, frame: &AudioFrame<'_>) -> Result<Vec<AudioWindow>> {
        // NOTE(review): the first frame's format is latched here, before the
        // fallible steps below; a first frame that fails later still fixes the
        // expected format until `reset` is called.
        self.validate_stream_format(frame)?;
        let frame_start_sample = timestamp_to_sample(frame.timestamp, frame.sample_rate)?;
        if self.next_window_start_sample.is_none() {
            // First frame of the stream: anchor both cursors at its start.
            self.buffered_start_sample = frame_start_sample;
            self.next_window_start_sample = Some(frame_start_sample);
        }

        let buffered_end_sample = self.buffered_start_sample + self.buffer.len() as u64;
        if frame_start_sample > buffered_end_sample {
            // Gap in the stream: drop the partial data and restart windowing
            // at the new frame.
            self.buffer.clear();
            self.buffered_start_sample = frame_start_sample;
            self.next_window_start_sample = Some(frame_start_sample);
        } else if frame_start_sample < buffered_end_sample {
            return Err(DetectError::InvalidArgument(
                "streaming audio frames must not overlap".to_string(),
            ));
        }

        self.buffer.extend(interleaved_to_mono(
            frame.data,
            frame.channels,
            self.config.channel_mix,
        )?);

        let mut windows = Vec::new();
        let mut next_start = self
            .next_window_start_sample
            .expect("next window start is initialized above");
        // Recompute the end: the buffer grew (and may have been cleared) above.
        let buffered_end_sample = self.buffered_start_sample + self.buffer.len() as u64;
        // Emit every window that fits entirely inside the buffered range.
        while next_start + self.config.frame_size as u64 <= buffered_end_sample {
            let offset = (next_start - self.buffered_start_sample) as usize;
            let end = offset + self.config.frame_size;
            windows.push(AudioWindow {
                timestamp: sample_to_timestamp(next_start, frame.sample_rate),
                sample_rate: frame.sample_rate,
                start_sample: next_start,
                samples: self.buffer[offset..end].to_vec(),
            });
            next_start += self.config.hop_size as u64;
        }
        self.next_window_start_sample = Some(next_start);
        // Drop samples that no future window can reach, then check the bound.
        self.trim_consumed();
        self.enforce_buffer_bound()?;
        Ok(windows)
    }

    /// Clears the buffered samples and forgets the latched format and cursors,
    /// so the next frame starts a new stream.
    pub fn reset(&mut self) {
        self.sample_rate = None;
        self.channels = None;
        self.buffer.clear();
        self.buffered_start_sample = 0;
        self.next_window_start_sample = None;
    }

    /// Number of mono samples currently held in the buffer.
    pub fn buffered_samples(&self) -> usize {
        self.buffer.len()
    }

    /// Latches the format of the first frame and rejects any later frame
    /// whose sample rate or channel count differs.
    fn validate_stream_format(&mut self, frame: &AudioFrame<'_>) -> Result<()> {
        match (self.sample_rate, self.channels) {
            (None, None) => {
                self.sample_rate = Some(frame.sample_rate);
                self.channels = Some(frame.channels);
                Ok(())
            }
            (Some(sample_rate), Some(channels))
                if sample_rate == frame.sample_rate && channels == frame.channels =>
            {
                Ok(())
            }
            // Covers a format mismatch as well as a half-initialised state.
            _ => Err(DetectError::InvalidArgument(
                "streaming audio sample_rate and channels must remain stable".to_string(),
            )),
        }
    }

    /// Removes samples that lie before the start of the next window.
    fn trim_consumed(&mut self) {
        let Some(next_start) = self.next_window_start_sample else {
            return;
        };
        if next_start <= self.buffered_start_sample {
            return;
        }
        // The next window may start beyond the buffered data (hop larger than
        // what is buffered), so never drop more than the buffer holds.
        let drop = (next_start - self.buffered_start_sample).min(self.buffer.len() as u64) as usize;
        if drop > 0 {
            self.buffer.drain(0..drop);
            self.buffered_start_sample += drop as u64;
        }
    }

    /// Returns an error when more than `max_buffered_samples` remain buffered.
    fn enforce_buffer_bound(&mut self) -> Result<()> {
        if self.buffer.len() <= self.config.max_buffered_samples {
            return Ok(());
        }
        Err(DetectError::InvalidArgument(format!(
            "streaming audio buffer exceeded max_buffered_samples ({})",
            self.config.max_buffered_samples
        )))
    }
}
644
645pub fn mono_samples(frame: &AudioFrame<'_>) -> Result<MonoSamples> {
647 mono_samples_with_mix(frame, ChannelMix::Average)
648}
649
650pub fn mono_samples_with_mix(frame: &AudioFrame<'_>, mix: ChannelMix) -> Result<MonoSamples> {
652 let samples = interleaved_to_mono(frame.data, frame.channels, mix)?;
653 Ok(MonoSamples {
654 timestamp: frame.timestamp,
655 sample_rate: frame.sample_rate,
656 samples,
657 })
658}
659
660pub fn interleaved_to_mono(
662 buffer: &AudioBuffer,
663 channels: u16,
664 mix: ChannelMix,
665) -> Result<Vec<f32>> {
666 if channels == 0 {
667 return Err(DetectError::InvalidAudioFormat {
668 sample_rate: 1,
669 channels,
670 });
671 }
672 let channels = channels as usize;
673 if !buffer.len().is_multiple_of(channels) {
674 return Err(DetectError::InvalidArgument(format!(
675 "audio buffer length {} is not divisible by channel count {channels}",
676 buffer.len()
677 )));
678 }
679 let normalized = normalized_samples(buffer);
680 Ok(match mix {
681 ChannelMix::First => normalized
682 .chunks_exact(channels)
683 .map(|frame| frame[0])
684 .collect(),
685 ChannelMix::Average => normalized
686 .chunks_exact(channels)
687 .map(|frame| frame.iter().sum::<f32>() / channels as f32)
688 .collect(),
689 })
690}
691
692pub fn normalized_samples(buffer: &AudioBuffer) -> Vec<f32> {
694 match buffer {
695 AudioBuffer::U8(values) => values
696 .iter()
697 .map(|value| (*value as f32 - 128.0) / 128.0)
698 .collect(),
699 AudioBuffer::I16(values) => values
700 .iter()
701 .map(|value| *value as f32 / i16::MAX as f32)
702 .collect(),
703 AudioBuffer::I32(values) => values
704 .iter()
705 .map(|value| *value as f32 / i32::MAX as f32)
706 .collect(),
707 AudioBuffer::F32(values) => values.clone(),
708 }
709}
710
/// Root-mean-square level of `samples`; `0.0` for an empty slice.
pub fn rms(samples: &[f32]) -> f32 {
    match samples.len() {
        0 => 0.0,
        count => {
            let energy = samples.iter().map(|&value| value * value).sum::<f32>();
            (energy / count as f32).sqrt()
        }
    }
}
718
/// Largest absolute sample value; `0.0` for an empty slice.
pub fn peak(samples: &[f32]) -> f32 {
    samples
        .iter()
        .fold(0.0_f32, |loudest, value| loudest.max(value.abs()))
}
726
/// Mean of the absolute sample values; `0.0` for an empty slice.
pub fn mean_absolute(samples: &[f32]) -> f32 {
    match samples.len() {
        0 => 0.0,
        count => {
            let magnitude_total = samples.iter().map(|value| value.abs()).sum::<f32>();
            magnitude_total / count as f32
        }
    }
}
734
/// Fraction of adjacent sample pairs whose signs differ.
///
/// Pairs in which either sample is exactly zero are not counted as crossings.
/// Returns `0.0` for fewer than two samples.
pub fn zero_crossing_rate(samples: &[f32]) -> f32 {
    let pair_count = samples.len().saturating_sub(1);
    if pair_count == 0 {
        return 0.0;
    }
    let crossings = samples
        .windows(2)
        .filter(|pair| {
            let (left, right) = (pair[0], pair[1]);
            left != 0.0 && right != 0.0 && left.is_sign_positive() != right.is_sign_positive()
        })
        .count();
    crossings as f32 / pair_count as f32
}
747
748pub fn windowed_level_series(
750 samples: &[f32],
751 sample_rate: u32,
752 frame_spec: FrameSpec,
753) -> Result<AudioFeatureSeries> {
754 AudioFormatSpec::new(sample_rate, 1)?;
755 FrameSpec::new(frame_spec.frame_size, frame_spec.hop_size)?;
756 validate_samples(samples)?;
757 let mut points = Vec::with_capacity(frame_spec.frame_count(samples.len()));
758 for (start_sample, frame) in frame_spec.frames(samples) {
759 let end_sample = start_sample + frame.len();
760 let mut values = BTreeMap::new();
761 values.insert("rms".to_string(), rms(frame));
762 values.insert("peak".to_string(), peak(frame));
763 values.insert("meanAbsolute".to_string(), mean_absolute(frame));
764 values.insert("zeroCrossingRate".to_string(), zero_crossing_rate(frame));
765 points.push(AudioFeaturePoint::new(
766 start_sample as f32 / sample_rate as f32,
767 end_sample as f32 / sample_rate as f32,
768 values,
769 )?);
770 }
771 AudioFeatureSeries::new(
772 sample_rate,
773 1,
774 frame_spec.frame_size,
775 frame_spec.hop_size,
776 points,
777 )
778}
779
780pub fn summarize_feature_series(series: &AudioFeatureSeries) -> Result<AudioFeatureSummary> {
782 series.validate()?;
783 let mut names = BTreeSet::new();
784 for point in &series.points {
785 names.extend(point.values.keys().cloned());
786 }
787
788 let mut metrics = BTreeMap::new();
789 for name in names {
790 let values = series
791 .points
792 .iter()
793 .filter_map(|point| point.values.get(&name).copied())
794 .collect::<Vec<_>>();
795 if values.is_empty() {
796 continue;
797 }
798 let mean = values.iter().sum::<f32>() / values.len() as f32;
799 let max = values.iter().copied().fold(f32::NEG_INFINITY, f32::max);
800 metrics.insert(format!("{name}.mean"), mean);
801 metrics.insert(format!("{name}.max"), max);
802 }
803
804 AudioFeatureSummary::new(
805 series.sample_rate,
806 series.duration_seconds(),
807 series.points.len(),
808 metrics,
809 )
810}
811
/// Resizes `samples` to exactly `target_len`.
///
/// Shorter buffers are extended with zeros; longer buffers are truncated.
pub fn zero_pad_to(mut samples: Vec<f32>, target_len: usize) -> Vec<f32> {
    if target_len < samples.len() {
        samples.truncate(target_len);
    } else {
        samples.resize_with(target_len, || 0.0);
    }
    samples
}
817
818pub fn seconds_to_samples(seconds: f64, sample_rate: u32) -> Result<u64> {
820 AudioFormatSpec::new(sample_rate, 1)?;
821 if !seconds.is_finite() || seconds < 0.0 {
822 return Err(DetectError::InvalidArgument(
823 "audio duration must be a finite non-negative value".to_string(),
824 ));
825 }
826 let samples = seconds * sample_rate as f64;
827 if !samples.is_finite() || samples < 0.0 {
828 return Err(DetectError::InvalidArgument(
829 "audio timestamp must resolve to a finite non-negative sample index".to_string(),
830 ));
831 }
832 Ok(samples.round() as u64)
833}
834
835pub fn samples_to_seconds(samples: u64, sample_rate: u32) -> Result<f64> {
837 AudioFormatSpec::new(sample_rate, 1)?;
838 Ok(samples as f64 / sample_rate as f64)
839}
840
841pub fn timestamp_to_sample(timestamp: Timestamp, sample_rate: u32) -> Result<u64> {
843 if timestamp.timebase.den == 0 {
844 return Err(DetectError::InvalidAudioFormat {
845 sample_rate,
846 channels: 1,
847 });
848 }
849 seconds_to_samples(timestamp.seconds(), sample_rate)
850}
851
852pub fn sample_to_timestamp(sample: u64, sample_rate: u32) -> Timestamp {
854 Timestamp::new(sample as i64, Timebase::new(1, sample_rate as i32))
855}
856
857fn validate_time_range(start_seconds: f32, end_seconds: f32, label: &str) -> Result<()> {
858 if !start_seconds.is_finite() || start_seconds < 0.0 {
859 return Err(DetectError::InvalidArgument(format!(
860 "{label} start_seconds must be finite and non-negative"
861 )));
862 }
863 if !end_seconds.is_finite() || end_seconds < 0.0 {
864 return Err(DetectError::InvalidArgument(format!(
865 "{label} end_seconds must be finite and non-negative"
866 )));
867 }
868 if end_seconds < start_seconds {
869 return Err(DetectError::InvalidArgument(format!(
870 "{label} end_seconds must be greater than or equal to start_seconds"
871 )));
872 }
873 Ok(())
874}
875
876fn validate_feature_values(values: &BTreeMap<String, f32>) -> Result<()> {
877 for (name, value) in values {
878 if name.trim().is_empty() {
879 return Err(DetectError::InvalidArgument(
880 "audio feature names must not be empty".to_string(),
881 ));
882 }
883 if !value.is_finite() {
884 return Err(DetectError::InvalidArgument(format!(
885 "audio feature `{name}` must be finite"
886 )));
887 }
888 }
889 Ok(())
890}
891
892fn validate_samples(samples: &[f32]) -> Result<()> {
893 for sample in samples {
894 if !sample.is_finite() {
895 return Err(DetectError::InvalidArgument(
896 "audio samples must contain only finite values".to_string(),
897 ));
898 }
899 }
900 Ok(())
901}
902
/// Whether two values differ by at most `16 * f32::EPSILON` (an absolute
/// tolerance). Always `false` when either value is NaN.
fn nearly_equal(left: f32, right: f32) -> bool {
    let tolerance = 16.0 * f32::EPSILON;
    let gap = (left - right).abs();
    gap <= tolerance
}
906
907#[cfg(test)]
908mod tests {
909 use super::*;
910 use proptest::prelude::*;
911 use video_analysis_core::{AudioBuffer, AudioFrame, Timebase, Timestamp};
912
913 fn assert_approx_eq(actual: f32, expected: f32, tolerance: f32) {
914 assert!(
915 (actual - expected).abs() <= tolerance,
916 "expected {actual} to be within {tolerance} of {expected}"
917 );
918 }
919
920 fn assert_approx_slice(actual: &[f32], expected: &[f32], tolerance: f32) {
921 assert_eq!(actual.len(), expected.len(), "slice lengths differ");
922 for (index, (actual, expected)) in actual.iter().zip(expected).enumerate() {
923 assert!(
924 (*actual - *expected).abs() <= tolerance,
925 "index {index}: expected {actual} to be within {tolerance} of {expected}"
926 );
927 }
928 }
929
930 fn ts() -> Timestamp {
931 Timestamp::new(0, Timebase::new(1, 48_000))
932 }
933
934 fn frame_at(sample: u64, samples: Vec<f32>) -> AudioBuffer {
935 let _ = sample;
936 AudioBuffer::F32(samples)
937 }
938
939 #[test]
940 fn mixes_interleaved_stereo_to_mono() {
941 let buffer = AudioBuffer::F32(vec![1.0, -1.0, 0.5, 0.25]);
942 let mono = interleaved_to_mono(&buffer, 2, ChannelMix::Average).unwrap();
943 assert_eq!(mono, vec![0.0, 0.375]);
944 }
945
946 #[test]
947 fn normalizes_all_supported_sample_formats() {
948 assert_approx_slice(
949 &normalized_samples(&AudioBuffer::U8(vec![0, 128, 255])),
950 &[-1.0, 0.0, 127.0 / 128.0],
951 1.0e-6,
952 );
953 assert_approx_slice(
954 &normalized_samples(&AudioBuffer::I16(vec![i16::MIN, 0, i16::MAX])),
955 &[i16::MIN as f32 / i16::MAX as f32, 0.0, 1.0],
956 1.0e-6,
957 );
958 assert_approx_slice(
959 &normalized_samples(&AudioBuffer::I32(vec![i32::MIN, 0, i32::MAX])),
960 &[i32::MIN as f32 / i32::MAX as f32, 0.0, 1.0],
961 1.0e-6,
962 );
963 assert_eq!(
964 normalized_samples(&AudioBuffer::F32(vec![-0.25, 0.0, 0.5])),
965 vec![-0.25, 0.0, 0.5]
966 );
967 }
968
969 #[test]
970 fn first_channel_mix_uses_first_interleaved_sample() {
971 let buffer = AudioBuffer::F32(vec![1.0, -1.0, 0.5, 0.25]);
972 let mono = interleaved_to_mono(&buffer, 2, ChannelMix::First).unwrap();
973 assert_eq!(mono, vec![1.0, 0.5]);
974 }
975
976 #[test]
977 fn batches_existing_audio_frames_into_channel_major_waveforms() {
978 let first = video_analysis_core::OwnedAudioFrame::new(
979 ts(),
980 48_000,
981 1,
982 AudioBuffer::F32(vec![0.1, 0.2]),
983 )
984 .unwrap();
985 let second = video_analysis_core::OwnedAudioFrame::new(
986 ts(),
987 48_000,
988 1,
989 AudioBuffer::F32(vec![0.3, 0.4]),
990 )
991 .unwrap();
992
993 let batch = OwnedAudioWaveformBatch::from_audio_frames(&[first, second]).unwrap();
994 let view = batch.as_view().unwrap();
995 assert_eq!(view.batch_size(), 2);
996 assert_eq!(view.waveform(1, 0).unwrap(), &[0.3, 0.4]);
997 }
998
999 #[test]
1000 fn mono_mix_rejects_invalid_channel_layouts() {
1001 assert!(interleaved_to_mono(&AudioBuffer::F32(vec![1.0]), 0, ChannelMix::Average).is_err());
1002 assert!(interleaved_to_mono(
1003 &AudioBuffer::F32(vec![1.0, 2.0, 3.0]),
1004 2,
1005 ChannelMix::Average
1006 )
1007 .is_err());
1008 }
1009
1010 #[test]
1011 fn frame_spec_validates_sizes_and_counts_frames() {
1012 assert!(FrameSpec::new(0, 1).is_err());
1013 assert!(FrameSpec::new(4, 0).is_err());
1014 let spec = FrameSpec::new(4, 2).unwrap();
1015 assert_eq!(spec.frame_count(3), 0);
1016 assert_eq!(spec.frame_count(4), 1);
1017 assert_eq!(spec.frame_count(6), 2);
1018 assert_eq!(spec.frame_count(7), 2);
1019 }
1020
1021 #[test]
1022 fn frame_spec_iterates_over_hops() {
1023 let spec = FrameSpec::new(4, 2).unwrap();
1024 let samples = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0];
1025 let frames = spec.frames(&samples).collect::<Vec<_>>();
1026 assert_eq!(frames.len(), 2);
1027 assert_eq!(frames[0], (0, &[0.0, 1.0, 2.0, 3.0][..]));
1028 assert_eq!(frames[1], (2, &[2.0, 3.0, 4.0, 5.0][..]));
1029 }
1030
1031 #[test]
1032 fn feature_contracts_validate_ranges_and_values() {
1033 assert!(AudioFeaturePoint::new(1.0, 0.5, BTreeMap::new()).is_err());
1034 assert!(AudioFeaturePoint::new(f32::NAN, 1.0, BTreeMap::new()).is_err());
1035
1036 let mut values = BTreeMap::new();
1037 values.insert("rms".to_string(), f32::INFINITY);
1038 assert!(AudioFeaturePoint::new(0.0, 1.0, values).is_err());
1039
1040 let point =
1041 AudioFeaturePoint::new(0.0, 0.5, BTreeMap::from([("rms".to_string(), 0.25)])).unwrap();
1042 assert!(AudioFeatureSeries::new(0, 1, 128, 64, vec![point.clone()]).is_err());
1043 assert!(AudioFeatureSeries::new(48_000, 1, 0, 64, vec![point.clone()]).is_err());
1044 assert!(AudioFeatureSummary::new(
1045 48_000,
1046 f32::NAN,
1047 1,
1048 BTreeMap::from([("rms.mean".to_string(), 0.25)])
1049 )
1050 .is_err());
1051 }
1052
1053 #[test]
1054 fn windowed_level_series_summarizes_deterministic_metrics() {
1055 let series =
1056 windowed_level_series(&[0.0, 1.0, -1.0, 0.0], 4, FrameSpec::new(2, 1).unwrap())
1057 .unwrap();
1058 assert_eq!(series.points.len(), 3);
1059 assert_eq!(series.points[0].start_seconds, 0.0);
1060 assert_eq!(series.points[0].end_seconds, 0.5);
1061 assert_approx_eq(series.points[0].values["rms"], 0.5_f32.sqrt(), 1.0e-6);
1062 assert_approx_eq(series.points[1].values["zeroCrossingRate"], 1.0, 1.0e-6);
1063
1064 let summary = summarize_feature_series(&series).unwrap();
1065 assert_eq!(summary.sample_rate, 4);
1066 assert_eq!(summary.frame_count, 3);
1067 assert_approx_eq(summary.duration_seconds, 1.0, 1.0e-6);
1068 assert!(summary.metrics["rms.mean"] > 0.0);
1069 assert_eq!(zero_crossing_rate(&[0.0, 1.0, 0.0]), 0.0);
1070 }
1071
1072 #[test]
1073 fn audio_frame_to_mono_preserves_timing() {
1074 let buffer = AudioBuffer::I16(vec![0, i16::MAX]);
1075 let frame = AudioFrame::new(ts(), 48_000, 1, &buffer).unwrap();
1076 let mono = mono_samples(&frame).unwrap();
1077 assert_eq!(mono.timestamp, ts());
1078 assert_eq!(mono.sample_rate, 48_000);
1079 assert_eq!(mono.samples, vec![0.0, 1.0]);
1080 }
1081
1082 #[test]
1083 fn hann_window_tapers_edges() {
1084 let windowed = WindowFunction::Hann.apply(&[1.0, 1.0, 1.0, 1.0]);
1085 assert!(windowed[0].abs() < 0.000_001);
1086 assert!(windowed[1] > 0.7);
1087 assert!(windowed[2] > 0.7);
1088 assert!(windowed[3].abs() < 0.000_001);
1089 }
1090
1091 #[test]
1092 fn streaming_buffer_emits_windows_inside_one_chunk() {
1093 let config = StreamingFrameConfig::new(4, 2).unwrap();
1094 let mut buffer = StreamingFrameBuffer::new(config).unwrap();
1095 let samples = AudioBuffer::F32(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0]);
1096 let frame = AudioFrame::new(ts(), 48_000, 1, &samples).unwrap();
1097
1098 let windows = buffer.push_frame(&frame).unwrap();
1099
1100 assert_eq!(windows.len(), 2);
1101 assert_eq!(windows[0].start_sample, 0);
1102 assert_eq!(windows[0].samples, vec![0.0, 1.0, 2.0, 3.0]);
1103 assert_eq!(windows[1].start_sample, 2);
1104 assert_eq!(windows[1].samples, vec![2.0, 3.0, 4.0, 5.0]);
1105 }
1106
1107 #[test]
1108 fn streaming_buffer_emits_windows_across_chunks() {
1109 let config = StreamingFrameConfig::new(4, 2).unwrap();
1110 let mut buffer = StreamingFrameBuffer::new(config).unwrap();
1111 let first = AudioBuffer::F32(vec![0.0, 1.0, 2.0]);
1112 let second = AudioBuffer::F32(vec![3.0, 4.0, 5.0]);
1113 let first_frame = AudioFrame::new(ts(), 48_000, 1, &first).unwrap();
1114 let second_frame = AudioFrame::new(
1115 Timestamp::new(3, Timebase::new(1, 48_000)),
1116 48_000,
1117 1,
1118 &second,
1119 )
1120 .unwrap();
1121
1122 assert!(buffer.push_frame(&first_frame).unwrap().is_empty());
1123 let windows = buffer.push_frame(&second_frame).unwrap();
1124
1125 assert_eq!(windows.len(), 2);
1126 assert_eq!(windows[0].samples, vec![0.0, 1.0, 2.0, 3.0]);
1127 assert_eq!(windows[1].samples, vec![2.0, 3.0, 4.0, 5.0]);
1128 }
1129
1130 #[test]
1131 fn streaming_buffer_preserves_window_timestamps() {
1132 let config = StreamingFrameConfig::new(4, 2).unwrap();
1133 let mut buffer = StreamingFrameBuffer::new(config).unwrap();
1134 let samples = AudioBuffer::F32(vec![0.0; 6]);
1135 let frame = AudioFrame::new(
1136 Timestamp::new(10, Timebase::new(1, 48_000)),
1137 48_000,
1138 1,
1139 &samples,
1140 )
1141 .unwrap();
1142
1143 let windows = buffer.push_frame(&frame).unwrap();
1144
1145 assert_eq!(
1146 windows
1147 .iter()
1148 .map(|window| window.timestamp)
1149 .collect::<Vec<_>>(),
1150 vec![
1151 Timestamp::new(10, Timebase::new(1, 48_000)),
1152 Timestamp::new(12, Timebase::new(1, 48_000)),
1153 ]
1154 );
1155 }
1156
1157 #[test]
1158 fn streaming_buffer_requires_stable_format() {
1159 let config = StreamingFrameConfig::new(4, 2).unwrap();
1160 let mut buffer = StreamingFrameBuffer::new(config).unwrap();
1161 let first = AudioBuffer::F32(vec![0.0; 4]);
1162 let second = AudioBuffer::F32(vec![0.0; 4]);
1163 let first_frame = AudioFrame::new(ts(), 48_000, 1, &first).unwrap();
1164 let second_frame = AudioFrame::new(
1165 Timestamp::new(4, Timebase::new(1, 44_100)),
1166 44_100,
1167 1,
1168 &second,
1169 )
1170 .unwrap();
1171
1172 buffer.push_frame(&first_frame).unwrap();
1173
1174 assert!(buffer.push_frame(&second_frame).is_err());
1175 }
1176
// Pushes a 32-sample frame into a buffer configured with
// `max_buffered_samples(8)` and checks that the push succeeds while
// `buffered_samples()` reports at most 8 afterwards.
#[test]
fn streaming_buffer_keeps_retained_samples_bounded() {
    let bounded = StreamingFrameConfig::new(8, 8).unwrap().max_buffered_samples(8);
    let mut framer = StreamingFrameBuffer::new(bounded).unwrap();

    let payload = AudioBuffer::F32(vec![0.0; 32]);
    let frame = AudioFrame::new(ts(), 48_000, 1, &payload).unwrap();

    let accepted = framer.push_frame(&frame).is_ok();
    assert!(accepted);

    let retained = framer.buffered_samples();
    assert!(retained <= 8);
}
1189
// A buffer that has already accepted 48 kHz audio must accept a 44.1 kHz frame
// once `reset()` has been called in between.
#[test]
fn streaming_buffer_reset_allows_new_format() {
    let mut framer = StreamingFrameBuffer::new(StreamingFrameConfig::new(4, 2).unwrap()).unwrap();

    // Feed one frame in the original format.
    let before_reset = AudioBuffer::F32(vec![0.0; 4]);
    let frame_48k = AudioFrame::new(ts(), 48_000, 1, &before_reset).unwrap();
    framer.push_frame(&frame_48k).unwrap();

    framer.reset();

    // A frame in a different format is now expected to be accepted.
    let after_reset = AudioBuffer::F32(vec![0.0; 4]);
    let stamp_44k = Timestamp::new(0, Timebase::new(1, 44_100));
    let frame_44k = AudioFrame::new(stamp_44k, 44_100, 1, &after_reset).unwrap();

    let accepted = framer.push_frame(&frame_44k).is_ok();
    assert!(accepted);
}
1212
1213 proptest! {
1214 #[test]
1215 fn generated_interleaved_mono_length_matches_samples_per_channel(
1216 channels in 1_u16..=8,
1217 frames in 0_usize..64,
1218 samples in proptest::collection::vec(-1.0_f32..1.0, 0..512),
1219 ) {
1220 let channels = channels as usize;
1221 let len = frames * channels;
1222 let mut values = samples;
1223 values.resize(len, 0.0);
1224 let mono = interleaved_to_mono(&AudioBuffer::F32(values), channels as u16, ChannelMix::Average).unwrap();
1225 prop_assert_eq!(mono.len(), frames);
1226 }
1227
1228 #[test]
1229 fn streaming_windows_do_not_depend_on_chunk_partition(
1230 len in 16_usize..96,
1231 chunk_size in 1_usize..24,
1232 ) {
1233 let samples = (0..len).map(|value| value as f32).collect::<Vec<_>>();
1234 let config = StreamingFrameConfig::new(8, 4).unwrap();
1235
1236 let all_buffer = AudioBuffer::F32(samples.clone());
1237 let all_frame = AudioFrame::new(ts(), 48_000, 1, &all_buffer).unwrap();
1238 let mut all = StreamingFrameBuffer::new(config).unwrap();
1239 let expected = all.push_frame(&all_frame).unwrap();
1240
1241 let mut chunked = StreamingFrameBuffer::new(config).unwrap();
1242 let mut actual = Vec::new();
1243 let mut start = 0;
1244 while start < samples.len() {
1245 let end = (start + chunk_size).min(samples.len());
1246 let buffer = frame_at(start as u64, samples[start..end].to_vec());
1247 let frame = AudioFrame::new(
1248 Timestamp::new(start as i64, Timebase::new(1, 48_000)),
1249 48_000,
1250 1,
1251 &buffer,
1252 )
1253 .unwrap();
1254 actual.extend(chunked.push_frame(&frame).unwrap());
1255 start = end;
1256 }
1257
1258 prop_assert_eq!(actual, expected);
1259 }
1260 }
1261
// Checks `rms` on a two-sample ±1.0 input and checks that `peak` and
// `mean_absolute` return 0.0 for an empty slice.
#[test]
fn scalar_level_helpers_are_empty_safe() {
    let unit_rms = rms(&[1.0, -1.0]);
    assert_approx_eq(unit_rms, 1.0, 1.0e-6);

    let empty_peak = peak(&[]);
    assert_eq!(empty_peak, 0.0);

    let empty_mean = mean_absolute(&[]);
    assert_eq!(empty_mean, 0.0);
}
1268
// A valid 48 kHz stereo spec reports 4_800 samples per channel as 0.1 s, while
// a zero sample rate, zero channel count, or zero frame size is rejected.
#[test]
fn audio_format_spec_validates_and_reports_duration() {
    let stereo = AudioFormatSpec::new(48_000, 2).unwrap();

    // `frame_samples` consumes a copy of the spec, so `stereo` stays usable below.
    let framed = stereo.frame_samples(2_048).unwrap();
    let seconds = framed.duration_seconds(4_800).unwrap();
    assert_eq!(seconds, 0.1);

    let zero_rate = AudioFormatSpec::new(0, 2);
    assert!(zero_rate.is_err());

    let zero_channels = AudioFormatSpec::new(48_000, 0);
    assert!(zero_channels.is_err());

    let zero_frame = stereo.frame_samples(0);
    assert!(zero_frame.is_err());
}
1283
// Converts a timestamp to a sample index and back, converts seconds to a
// sample count and back, and checks that a negative duration is rejected.
#[test]
fn sample_and_timestamp_helpers_round_trip() {
    // Timestamp <-> sample index at 44.1 kHz.
    let origin = Timestamp::new(2_205, Timebase::new(1, 44_100));
    let index = timestamp_to_sample(origin, 44_100).unwrap();
    assert_eq!(index, 2_205);
    let restored = sample_to_timestamp(index, 44_100);
    assert_eq!(restored, origin);

    // Seconds <-> sample count at 16 kHz.
    let half_second = seconds_to_samples(0.5, 16_000).unwrap();
    assert_eq!(half_second, 8_000);
    let duration = samples_to_seconds(8_000, 16_000).unwrap();
    assert_eq!(duration, 0.5);

    let negative = seconds_to_samples(-1.0, 16_000);
    assert!(negative.is_err());
}
1294
// After a four-sample frame stamped by `ts()` has been accepted, a second
// frame stamped at tick 2 (timebase 1/48_000) must be rejected with an error.
// NOTE(review): this presumes `ts()` is tick 0 in the same timebase, so the
// second frame overlaps the first — confirm against the helper.
#[test]
fn streaming_buffer_detects_overlapping_chunks() {
    let mut framer = StreamingFrameBuffer::new(StreamingFrameConfig::new(4, 2).unwrap()).unwrap();

    let head_samples = AudioBuffer::F32(vec![0.0, 1.0, 2.0, 3.0]);
    let head_frame = AudioFrame::new(ts(), 48_000, 1, &head_samples).unwrap();
    framer.push_frame(&head_frame).unwrap();

    let repeat_samples = AudioBuffer::F32(vec![2.0, 3.0, 4.0, 5.0]);
    let repeat_stamp = Timestamp::new(2, Timebase::new(1, 48_000));
    let repeat_frame = AudioFrame::new(repeat_stamp, 48_000, 1, &repeat_samples).unwrap();

    let rejected = framer.push_frame(&repeat_frame).is_err();
    assert!(rejected);
}
1312}