Skip to main content

speech_prep/
converter.rs

1//! High-level audio format conversion pipeline.
2//!
//! This module provides a unified API for converting arbitrary audio formats
//! to a standard format: mono, 16kHz, normalized f32 samples.
5//!
6//! ## Pipeline Stages
7//!
8//! 1. **Format Detection**: Identify audio container format (WAV, MP3, FLAC,
9//!    etc.)
//! 2. **Decoding**: Extract PCM samples from the container (currently WAV only, 16/24-bit)
11//! 3. **Resampling**: Convert to 16kHz standard rate (linear interpolation)
12//! 4. **Channel Mixing**: Downmix to mono (simple averaging)
13//!
14//! ## Performance Contract
15//!
16//! - **Target Latency**: <10ms for 3-second audio clip
17//! - **Memory**: Streaming-friendly, minimal allocations
18//! - **Quality**: RMS error <0.01, zero clipping
19//!
20//! ## Example
21//!
22//! ```rust,no_run
23//! use speech_prep::converter::AudioFormatConverter;
24//!
25//! let audio_bytes = std::fs::read("recording.wav")?;
26//! let standard = AudioFormatConverter::convert_to_standard(&audio_bytes)?;
27//!
28//! println!(
29//!     "Converted {} samples from {} to mono 16kHz",
30//!     standard.samples.len(),
31//!     standard.metadata.original_format
32//! );
33//! # Ok::<(), Box<dyn std::error::Error>>(())
34//! ```
35
36use crate::error::{Error, Result};
37use crate::time::{AudioDuration, AudioInstant};
38
39use crate::decoder::{ChannelMixer, SampleRateConverter, WavDecoder};
40use crate::format::{AudioFormat, FormatDetector};
41
42/// Standardized audio output: mono, 16kHz, normalized samples.
43///
44/// This is the canonical format for all audio processing,
45/// designed to be consumed by downstream scoring and analysis, and other
46/// downstream components.
47#[derive(Debug, Clone, PartialEq)]
48pub struct StandardAudio {
49    /// Mono audio samples at 16kHz, normalized to [-1.0, 1.0].
50    pub samples: Vec<f32>,
51    /// Metadata tracking the conversion journey and quality metrics.
52    pub metadata: ConversionMetadata,
53}
54
55impl StandardAudio {
56    /// Total number of mono samples.
57    #[must_use]
58    pub fn sample_count(&self) -> usize {
59        self.samples.len()
60    }
61
62    /// Duration in seconds at 16kHz.
63    #[must_use]
64    pub fn duration_sec(&self) -> f64 {
65        self.samples.len() as f64 / 16000.0
66    }
67
68    /// Check if the audio is effectively silent (all samples near zero).
69    #[must_use]
70    pub fn is_silent(&self) -> bool {
71        self.samples.iter().all(|&s| s.abs() < 1e-4)
72    }
73}
74
75/// Metadata tracking the complete conversion pipeline journey.
76///
77/// Captures information from all 4 pipeline stages to enable debugging,
78/// quality validation, and observability.
79#[derive(Debug, Clone, Copy, PartialEq)]
80pub struct ConversionMetadata {
81    /// Detected audio format (WAV, MP3, FLAC, etc.).
82    pub original_format: AudioFormat,
83    /// Original sample rate in Hz before resampling.
84    pub original_sample_rate: u32,
85    /// Original number of channels before mixing.
86    pub original_channels: u8,
87    /// Original bit depth (if applicable, e.g., 16, 24 for PCM).
88    pub original_bit_depth: Option<u16>,
89    /// Peak amplitude in original audio before any processing.
90    pub peak_before: f32,
91    /// Peak amplitude after complete conversion pipeline.
92    pub peak_after: f32,
93    /// Total time spent in conversion pipeline (all 4 stages).
94    pub conversion_time_ms: f64,
95    /// Time spent in format detection stage.
96    pub detection_time_ms: f64,
97    /// Time spent in decoding stage.
98    pub decode_time_ms: f64,
99    /// Time spent in resampling stage.
100    pub resample_time_ms: f64,
101    /// Time spent in channel mixing stage.
102    pub mix_time_ms: f64,
103}
104
105impl ConversionMetadata {
106    /// Check if any stage exceeded expected latency budget.
107    ///
108    /// Expected budget breakdown for 3s clip:
109    /// - Detection: <1ms
110    /// - Decoding: <3ms
111    /// - Resampling: <5ms
112    /// - Mixing: <1ms
113    /// - **Total: <10ms**
114    #[must_use]
115    pub fn has_performance_issue(&self) -> bool {
116        self.conversion_time_ms > 10.0
117            || self.detection_time_ms > 1.0
118            || self.decode_time_ms > 3.0
119            || self.resample_time_ms > 5.0
120            || self.mix_time_ms > 1.0
121    }
122
123    /// Calculate the peak amplitude reduction ratio from conversion.
124    ///
125    /// Returns the ratio of final peak to original peak. Values:
126    /// - 1.0 = no amplitude change
127    /// - <1.0 = amplitude reduced (common with averaging/resampling)
128    /// - >1.0 = amplitude increased (rare, may indicate issue)
129    #[must_use]
130    pub fn peak_ratio(&self) -> f32 {
131        if self.peak_before.abs() < f32::EPSILON {
132            1.0 // Avoid division by zero for silent input
133        } else {
134            self.peak_after / self.peak_before
135        }
136    }
137}
138
/// Entry point of the audio normalization pipeline.
///
/// A single call takes the raw bytes of an audio file and produces the
/// standard representation used everywhere else: mono, 16kHz, normalized
/// f32 samples.
///
/// ## Pipeline Architecture
///
/// ```text
/// Input Bytes
///     ↓
/// [Format Detection] ← 6 formats: WAV, MP3, FLAC, Opus, WebM, AAC
///     ↓
/// [WAV Decoding] ← 16/24-bit PCM normalization
///     ↓
/// [Resampling] ← Arbitrary rate → 16kHz (linear interpolation)
///     ↓
/// [Channel Mixing] ← Multi-channel → Mono (simple averaging)
///     ↓
/// StandardAudio (mono, 16kHz, f32)
/// ```
///
/// ## Current Scope Limitations
///
/// - **Formats**: WAV is the only format that is decoded; the other formats
///   are detected and then rejected
/// - **Channel Counts**: 1, 2, 4, 6 channels supported
/// - **Bit Depths**: 16-bit and 24-bit PCM only
/// - **Resampling**: Linear interpolation (sinc reserved for future)
///
/// ## Example
///
/// ```rust,no_run
/// use speech_prep::converter::AudioFormatConverter;
///
/// // Convert any supported audio to standard format
/// let wav_bytes = std::fs::read("audio.wav")?;
/// let standard = AudioFormatConverter::convert_to_standard(&wav_bytes)?;
///
/// // Access standardized samples and metadata
/// println!("Format: {}", standard.metadata.original_format);
/// println!("Converted {} Hz → 16kHz", standard.metadata.original_sample_rate);
/// println!("Converted {} ch → mono", standard.metadata.original_channels);
/// println!("Conversion time: {:.2}ms", standard.metadata.conversion_time_ms);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug, Default, Clone, Copy)]
pub struct AudioFormatConverter;
186
187impl AudioFormatConverter {
188    /// Create a new audio format converter instance.
189    #[must_use]
190    pub const fn new() -> Self {
191        Self
192    }
193
194    /// Convert arbitrary audio bytes to standard format: mono, 16kHz, f32.
195    ///
196    /// This is the primary entry point for the audio normalization pipeline.
197    /// It composes all 4 stages: format detection, decoding, resampling, and
198    /// channel mixing.
199    ///
200    /// # Arguments
201    ///
202    /// * `audio_bytes` - Raw audio file bytes (any supported format)
203    ///
204    /// # Returns
205    ///
206    /// `StandardAudio` with mono 16kHz samples and complete conversion
207    /// metadata.
208    ///
209    /// # Errors
210    ///
211    /// Returns `Error::InvalidInput` if:
212    /// - Format detection fails (not a recognized audio format)
213    /// - Format is detected but not WAV (only WAV decoding supported)
214    /// - WAV decoding fails (malformed file, unsupported codec)
215    /// - Resampling fails (invalid sample rates)
216    /// - Channel mixing fails (unsupported channel count)
217    ///
218    /// # Performance
219    ///
220    /// Target: <10ms for 3-second audio clip on reference hardware.
221    /// Actual timing captured in `ConversionMetadata.conversion_time_ms`.
222    ///
223    /// # Example
224    ///
225    /// ```rust,no_run
226    /// use speech_prep::converter::AudioFormatConverter;
227    ///
228    /// let audio_bytes = std::fs::read("recording.wav")?;
229    /// let standard = AudioFormatConverter::convert_to_standard(&audio_bytes)?;
230    ///
231    /// assert_eq!(standard.metadata.original_format.as_str(), "wav");
232    /// assert!(standard.samples.len() > 0);
233    /// # Ok::<(), Box<dyn std::error::Error>>(())
234    /// ```
235    #[allow(clippy::cognitive_complexity)] // Large function — audio format conversion pipeline
236    pub fn convert_to_standard(audio_bytes: &[u8]) -> Result<StandardAudio> {
237        let pipeline_start = AudioInstant::now();
238
239        tracing::debug!(
240            audio_bytes_len = audio_bytes.len(),
241            "Starting audio format conversion pipeline"
242        );
243
244        let detection_start = AudioInstant::now();
245        let format_metadata = FormatDetector::detect(audio_bytes)?;
246        let detection_duration = elapsed_since(detection_start);
247        let detection_time_ms = detection_duration.as_secs_f64() * 1000.0;
248
249        tracing::debug!(
250            format = %format_metadata.format,
251            detection_time_ms,
252            "Format detection complete"
253        );
254
255        if format_metadata.format != AudioFormat::WavPcm {
256            return Err(Error::InvalidInput(format!(
257                "unsupported format for decoding: {} (only WAV supported)",
258                format_metadata.format.as_str()
259            )));
260        }
261
262        let decode_start = AudioInstant::now();
263        let decoded = WavDecoder::decode(audio_bytes)?;
264        let decode_duration = elapsed_since(decode_start);
265        let decode_time_ms = decode_duration.as_secs_f64() * 1000.0;
266
267        tracing::debug!(
268            sample_rate = decoded.sample_rate,
269            channels = decoded.channels,
270            bit_depth = decoded.bit_depth,
271            sample_count = decoded.samples.len(),
272            decode_time_ms,
273            "WAV decoding complete"
274        );
275
276        let peak_before = decoded
277            .samples
278            .iter()
279            .map(|s| s.abs())
280            .fold(0.0f32, f32::max);
281
282        let resample_start = AudioInstant::now();
283        let resampled = SampleRateConverter::resample(
284            &decoded.samples,
285            decoded.channels,
286            decoded.sample_rate,
287            16000,
288        )?;
289        let resample_duration = elapsed_since(resample_start);
290        let resample_time_ms = resample_duration.as_secs_f64() * 1000.0;
291
292        tracing::debug!(
293            input_rate = decoded.sample_rate,
294            output_rate = 16000,
295            output_samples = resampled.len(),
296            resample_time_ms,
297            "Sample rate conversion complete"
298        );
299
300        let mix_start = AudioInstant::now();
301        let mixed = ChannelMixer::mix_to_mono(&resampled, decoded.channels)?;
302        let mix_duration = elapsed_since(mix_start);
303        let mix_time_ms = mix_duration.as_secs_f64() * 1000.0;
304
305        tracing::debug!(
306            input_channels = decoded.channels,
307            output_samples = mixed.samples.len(),
308            peak_before_mix = mixed.peak_before_mix,
309            peak_after_mix = mixed.peak_after_mix,
310            mix_time_ms,
311            "Channel mixing complete"
312        );
313
314        let conversion_duration = elapsed_since(pipeline_start);
315        let conversion_time_ms = conversion_duration.as_secs_f64() * 1000.0;
316
317        if conversion_time_ms > 10.0 {
318            tracing::warn!(
319                conversion_time_ms,
320                detection_time_ms,
321                decode_time_ms,
322                resample_time_ms,
323                mix_time_ms,
324                "Audio conversion exceeded 10ms target latency"
325            );
326        } else {
327            tracing::debug!(conversion_time_ms, "Audio conversion pipeline complete");
328        }
329
330        let metadata = ConversionMetadata {
331            original_format: format_metadata.format,
332            original_sample_rate: decoded.sample_rate,
333            original_channels: decoded.channels,
334            original_bit_depth: Some(decoded.bit_depth),
335            peak_before,
336            peak_after: mixed.peak_after_mix,
337            conversion_time_ms,
338            detection_time_ms,
339            decode_time_ms,
340            resample_time_ms,
341            mix_time_ms,
342        };
343
344        Ok(StandardAudio {
345            samples: mixed.samples,
346            metadata,
347        })
348    }
349}
350
351fn elapsed_since(start: AudioInstant) -> AudioDuration {
352    AudioInstant::now().duration_since(start)
353}
354
#[cfg(test)]
mod tests {
    use super::*;

    type TestResult<T> = std::result::Result<T, String>;

    /// Encode `samples` as an in-memory 16-bit integer PCM WAV file.
    fn create_test_wav(sample_rate: u32, channels: u16, samples: &[i16]) -> TestResult<Vec<u8>> {
        let spec = hound::WavSpec {
            channels,
            sample_rate,
            bits_per_sample: 16,
            sample_format: hound::SampleFormat::Int,
        };

        let mut buffer = std::io::Cursor::new(Vec::new());
        let mut writer = hound::WavWriter::new(&mut buffer, spec)
            .map_err(|e| format!("failed to create WAV writer: {e}"))?;

        samples.iter().try_for_each(|&sample| {
            writer
                .write_sample(sample)
                .map_err(|e| format!("failed to write sample: {e}"))
        })?;

        writer
            .finalize()
            .map_err(|e| format!("failed to finalize WAV: {e}"))?;

        Ok(buffer.into_inner())
    }

    /// Run the converter and stringify any error so tests can use `?`.
    fn convert(wav: &[u8]) -> TestResult<StandardAudio> {
        AudioFormatConverter::convert_to_standard(wav).map_err(|e| e.to_string())
    }

    #[test]
    fn test_convert_mono_16khz_identity() -> TestResult<()> {
        // Input is already mono at 16kHz, with a small amplitude.
        let wav = create_test_wav(16000, 1, &[100_i16, 200, -100, -200])?;

        let standard = convert(&wav)?;
        let meta = standard.metadata;

        // Nothing to resample or mix, so all 4 samples come through.
        assert_eq!(standard.samples.len(), 4);
        assert_eq!(meta.original_sample_rate, 16000);
        assert_eq!(meta.original_channels, 1);
        assert_eq!(meta.original_format, AudioFormat::WavPcm);

        Ok(())
    }

    #[test]
    fn test_convert_stereo_44100_to_standard() -> TestResult<()> {
        // Stereo 44.1kHz → mono 16kHz, using 2 stereo frames.
        let wav = create_test_wav(44100, 2, &[1000_i16, -1000, 2000, -2000])?;

        let standard = convert(&wav)?;

        // Expect ~1 sample after downsampling 44.1kHz → 16kHz and mixing
        // stereo → mono.
        // Original: 2 frames at 44.1kHz = ~0.045ms
        // At 16kHz: ~0.045ms * 16000 = ~0.72 samples (rounds to 1)
        assert!(!standard.samples.is_empty());
        assert_eq!(standard.metadata.original_sample_rate, 44100);
        assert_eq!(standard.metadata.original_channels, 2);

        Ok(())
    }

    #[test]
    fn test_convert_tracks_timing() -> TestResult<()> {
        let wav = create_test_wav(16000, 1, &[0_i16; 1000])?;

        let meta = convert(&wav)?.metadata;

        let stage_times = [
            meta.detection_time_ms,
            meta.decode_time_ms,
            meta.resample_time_ms,
            meta.mix_time_ms,
        ];

        // Every timing field must hold a non-negative measurement.
        for stage_ms in &stage_times {
            assert!(*stage_ms >= 0.0);
        }
        assert!(meta.conversion_time_ms >= 0.0);

        // The total should roughly match the sum of the stages; the remainder
        // is measurement overhead.
        let stage_sum: f64 = stage_times.iter().sum();

        assert!(
            (meta.conversion_time_ms - stage_sum).abs() < 1.0,
            "total time {} should approximately equal stage sum {}",
            meta.conversion_time_ms,
            stage_sum
        );

        Ok(())
    }

    #[test]
    fn test_convert_tracks_peaks() -> TestResult<()> {
        // Known peak: 10000/32768 ≈ 0.305
        let wav = create_test_wav(16000, 1, &[10000_i16, -10000, 5000, -5000])?;

        let meta = convert(&wav)?.metadata;

        // Both peaks must have been recorded.
        assert!(meta.peak_before > 0.0);
        assert!(meta.peak_after > 0.0);

        // The input peak should land near 0.305.
        assert!(
            (meta.peak_before - 0.305).abs() < 0.01,
            "expected peak ~0.305, got {}",
            meta.peak_before
        );

        Ok(())
    }

    #[test]
    fn test_convert_rejects_non_wav() {
        // Fake MP3 header: a valid MP3 frame start.
        let mp3_bytes = [0xFF_u8, 0xFB, 0x90, 0x00];

        let result = AudioFormatConverter::convert_to_standard(&mp3_bytes);

        // Detected as MP3, then rejected because only WAV can be decoded.
        assert!(result.is_err());
        let err_msg = result
            .err()
            .map(|err| err.to_string())
            .unwrap_or_default();
        assert!(err_msg.contains("MP3") || err_msg.contains("unsupported"));
    }

    #[test]
    fn test_standard_audio_duration_calculation() -> TestResult<()> {
        // 16000 samples at 16kHz is exactly one second.
        let one_second = vec![0_i16; 16000];
        let wav = create_test_wav(16000, 1, &one_second)?;

        let standard = convert(&wav)?;

        assert!((standard.duration_sec() - 1.0).abs() < 0.01);

        Ok(())
    }

    #[test]
    fn test_standard_audio_is_silent_detection() -> TestResult<()> {
        let wav = create_test_wav(16000, 1, &[0_i16; 100])?;

        let standard = convert(&wav)?;

        assert!(standard.is_silent());

        Ok(())
    }

    #[test]
    fn test_conversion_metadata_peak_ratio() -> TestResult<()> {
        let wav = create_test_wav(16000, 1, &[10000_i16, -10000])?;

        let meta = convert(&wav)?.metadata;

        // Mono 16kHz input needs no mixing or resampling, so the peak should
        // be essentially unchanged.
        assert!(
            (meta.peak_ratio() - 1.0).abs() < 0.1,
            "expected peak ratio ~1.0, got {}",
            meta.peak_ratio()
        );

        Ok(())
    }
}