Skip to main content

speech_prep/
converter.rs

1//! High-level audio format conversion pipeline.
2//!
3//! This module provides a unified API for detecting common audio formats and
4//! converting WAV/PCM input to the crate's standard format: mono, 16kHz,
5//! normalized f32 samples.
6//!
7//! ## Pipeline Stages
8//!
9//! 1. **Format Detection**: Identify audio container format (WAV, MP3, FLAC,
10//!    etc.)
11//! 2. **Decoding**: Extract PCM samples from WAV input (supports 16/24-bit)
12//! 3. **Resampling**: Convert to 16kHz standard rate (linear interpolation)
13//! 4. **Channel Mixing**: Downmix to mono (simple averaging)
14//!
15//! ## Performance Contract
16//!
17//! - **Target Latency**: <10ms for 3-second audio clip
18//! - **Memory**: Streaming-friendly, minimal allocations
19//! - **Quality**: RMS error <0.01, zero clipping
20//!
21//! ## Example
22//!
23//! ```rust,no_run
24//! use speech_prep::converter::AudioFormatConverter;
25//!
26//! let audio_bytes = std::fs::read("recording.wav")?;
27//! let standard = AudioFormatConverter::convert_to_standard(&audio_bytes)?;
28//!
29//! assert!(!standard.samples.is_empty());
30//! # Ok::<(), Box<dyn std::error::Error>>(())
31//! ```
32
33use crate::error::{Error, Result};
34use crate::time::{AudioDuration, AudioInstant};
35
36use crate::decoder::{ChannelMixer, SampleRateConverter, WavDecoder};
37use crate::format::{AudioFormat, FormatDetector};
38
39/// Standardized audio output: mono, 16kHz, normalized samples.
40///
41/// This is the canonical format used by the rest of the crate.
42#[derive(Debug, Clone, PartialEq)]
43pub struct StandardAudio {
44    /// Mono audio samples at 16kHz, normalized to [-1.0, 1.0].
45    pub samples: Vec<f32>,
46    /// Metadata tracking the conversion journey and quality metrics.
47    pub metadata: ConversionMetadata,
48}
49
50impl StandardAudio {
51    /// Total number of mono samples.
52    #[must_use]
53    pub fn sample_count(&self) -> usize {
54        self.samples.len()
55    }
56
57    /// Duration in seconds at 16kHz.
58    #[must_use]
59    pub fn duration_sec(&self) -> f64 {
60        self.samples.len() as f64 / 16000.0
61    }
62
63    /// Check if the audio is effectively silent (all samples near zero).
64    #[must_use]
65    pub fn is_silent(&self) -> bool {
66        self.samples.iter().all(|&s| s.abs() < 1e-4)
67    }
68}
69
70/// Metadata tracking the complete conversion pipeline journey.
71///
72/// Captures information from all four conversion stages.
73#[derive(Debug, Clone, Copy, PartialEq)]
74pub struct ConversionMetadata {
75    /// Detected audio format (WAV, MP3, FLAC, etc.).
76    pub original_format: AudioFormat,
77    /// Original sample rate in Hz before resampling.
78    pub original_sample_rate: u32,
79    /// Original number of channels before mixing.
80    pub original_channels: u8,
81    /// Original bit depth (if applicable, e.g., 16, 24 for PCM).
82    pub original_bit_depth: Option<u16>,
83    /// Peak amplitude in original audio before any processing.
84    pub peak_before: f32,
85    /// Peak amplitude after complete conversion pipeline.
86    pub peak_after: f32,
87    /// Total time spent in conversion pipeline (all 4 stages).
88    pub conversion_time_ms: f64,
89    /// Time spent in format detection stage.
90    pub detection_time_ms: f64,
91    /// Time spent in decoding stage.
92    pub decode_time_ms: f64,
93    /// Time spent in resampling stage.
94    pub resample_time_ms: f64,
95    /// Time spent in channel mixing stage.
96    pub mix_time_ms: f64,
97}
98
99impl ConversionMetadata {
100    /// Check if any stage exceeded expected latency budget.
101    ///
102    /// Expected budget breakdown for 3s clip:
103    /// - Detection: <1ms
104    /// - Decoding: <3ms
105    /// - Resampling: <5ms
106    /// - Mixing: <1ms
107    /// - **Total: <10ms**
108    #[must_use]
109    pub fn has_performance_issue(&self) -> bool {
110        self.conversion_time_ms > 10.0
111            || self.detection_time_ms > 1.0
112            || self.decode_time_ms > 3.0
113            || self.resample_time_ms > 5.0
114            || self.mix_time_ms > 1.0
115    }
116
117    /// Calculate the peak amplitude reduction ratio from conversion.
118    ///
119    /// Returns the ratio of final peak to original peak. Values:
120    /// - 1.0 = no amplitude change
121    /// - <1.0 = amplitude reduced (common with averaging/resampling)
122    /// - >1.0 = amplitude increased (rare, may indicate issue)
123    #[must_use]
124    pub fn peak_ratio(&self) -> f32 {
125        if self.peak_before.abs() < f32::EPSILON {
126            1.0 // Avoid division by zero for silent input
127        } else {
128            self.peak_after / self.peak_before
129        }
130    }
131}
132
/// Entry point for turning raw audio bytes into [`StandardAudio`].
///
/// The converter recognizes common container formats and converts WAV/PCM
/// input into the crate's standard shape: mono, 16kHz, normalized f32 samples.
///
/// ## Pipeline Architecture
///
/// ```text
/// Input Bytes
///     ↓
/// [Format Detection] ← 6 formats: WAV, MP3, FLAC, Opus, WebM, AAC
///     ↓
/// [WAV Decoding] ← 16/24-bit PCM normalization
///     ↓
/// [Resampling] ← Arbitrary rate → 16kHz (linear interpolation)
///     ↓
/// [Channel Mixing] ← Multi-channel → Mono (simple averaging)
///     ↓
/// StandardAudio (mono, 16kHz, f32)
/// ```
///
/// ## Current Scope Limitations
///
/// - **Formats**: WAV is the only format that is decoded; the others are
///   detected and then rejected
/// - **Channel Counts**: 1, 2, 4, 6 channels supported
/// - **Bit Depths**: 16-bit and 24-bit PCM only
/// - **Resampling**: Linear interpolation (sinc reserved for future)
///
/// ## Example
///
/// ```rust,no_run
/// use speech_prep::converter::AudioFormatConverter;
///
/// let wav_bytes = std::fs::read("audio.wav")?;
/// let standard = AudioFormatConverter::convert_to_standard(&wav_bytes)?;
///
/// assert!(standard.metadata.original_sample_rate > 0);
/// assert!(standard.metadata.original_channels > 0);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug, Default, Clone, Copy)]
pub struct AudioFormatConverter;
176
177impl AudioFormatConverter {
178    /// Create a new audio format converter instance.
179    #[must_use]
180    pub const fn new() -> Self {
181        Self
182    }
183
184    /// Convert WAV audio bytes to standard format: mono, 16kHz, f32.
185    ///
186    /// This is the primary entry point for the audio normalization pipeline.
187    /// It composes all 4 stages: format detection, decoding, resampling, and
188    /// channel mixing.
189    ///
190    /// # Arguments
191    ///
192    /// * `audio_bytes` - Raw audio file bytes
193    ///
194    /// # Returns
195    ///
196    /// `StandardAudio` with mono 16kHz samples and complete conversion
197    /// metadata.
198    ///
199    /// # Errors
200    ///
201    /// Returns `Error::InvalidInput` if:
202    /// - Format detection fails (not a recognized audio format)
203    /// - Format is detected but not WAV (only WAV decoding supported)
204    /// - WAV decoding fails (malformed file, unsupported codec)
205    /// - Resampling fails (invalid sample rates)
206    /// - Channel mixing fails (unsupported channel count)
207    ///
208    /// # Performance
209    ///
210    /// Target: <10ms for 3-second audio clip on reference hardware.
211    /// Actual timing captured in `ConversionMetadata.conversion_time_ms`.
212    ///
213    /// # Example
214    ///
215    /// ```rust,no_run
216    /// use speech_prep::converter::AudioFormatConverter;
217    ///
218    /// let audio_bytes = std::fs::read("recording.wav")?;
219    /// let standard = AudioFormatConverter::convert_to_standard(&audio_bytes)?;
220    ///
221    /// assert_eq!(standard.metadata.original_format.as_str(), "wav");
222    /// assert!(standard.samples.len() > 0);
223    /// # Ok::<(), Box<dyn std::error::Error>>(())
224    /// ```
225    #[allow(clippy::cognitive_complexity)] // Large function — audio format conversion pipeline
226    pub fn convert_to_standard(audio_bytes: &[u8]) -> Result<StandardAudio> {
227        let pipeline_start = AudioInstant::now();
228
229        tracing::debug!(
230            audio_bytes_len = audio_bytes.len(),
231            "Starting audio format conversion pipeline"
232        );
233
234        let detection_start = AudioInstant::now();
235        let format_metadata = FormatDetector::detect(audio_bytes)?;
236        let detection_duration = elapsed_since(detection_start);
237        let detection_time_ms = detection_duration.as_secs_f64() * 1000.0;
238
239        tracing::debug!(
240            format = %format_metadata.format,
241            detection_time_ms,
242            "Format detection complete"
243        );
244
245        if format_metadata.format != AudioFormat::WavPcm {
246            return Err(Error::InvalidInput(format!(
247                "unsupported format for decoding: {} (only WAV supported)",
248                format_metadata.format.as_str()
249            )));
250        }
251
252        let decode_start = AudioInstant::now();
253        let decoded = WavDecoder::decode(audio_bytes)?;
254        let decode_duration = elapsed_since(decode_start);
255        let decode_time_ms = decode_duration.as_secs_f64() * 1000.0;
256
257        tracing::debug!(
258            sample_rate = decoded.sample_rate,
259            channels = decoded.channels,
260            bit_depth = decoded.bit_depth,
261            sample_count = decoded.samples.len(),
262            decode_time_ms,
263            "WAV decoding complete"
264        );
265
266        let peak_before = decoded
267            .samples
268            .iter()
269            .map(|s| s.abs())
270            .fold(0.0f32, f32::max);
271
272        let resample_start = AudioInstant::now();
273        let resampled = SampleRateConverter::resample_to_16khz(
274            &decoded.samples,
275            decoded.channels,
276            decoded.sample_rate,
277        )?;
278        let resample_duration = elapsed_since(resample_start);
279        let resample_time_ms = resample_duration.as_secs_f64() * 1000.0;
280
281        tracing::debug!(
282            input_rate = decoded.sample_rate,
283            output_rate = SampleRateConverter::TARGET_SAMPLE_RATE,
284            output_samples = resampled.len(),
285            resample_time_ms,
286            "Sample rate conversion complete"
287        );
288
289        let mix_start = AudioInstant::now();
290        let mixed = ChannelMixer::mix_to_mono(&resampled, decoded.channels)?;
291        let mix_duration = elapsed_since(mix_start);
292        let mix_time_ms = mix_duration.as_secs_f64() * 1000.0;
293
294        tracing::debug!(
295            input_channels = decoded.channels,
296            output_samples = mixed.samples.len(),
297            peak_before_mix = mixed.peak_before_mix,
298            peak_after_mix = mixed.peak_after_mix,
299            mix_time_ms,
300            "Channel mixing complete"
301        );
302
303        let conversion_duration = elapsed_since(pipeline_start);
304        let conversion_time_ms = conversion_duration.as_secs_f64() * 1000.0;
305
306        if conversion_time_ms > 10.0 {
307            tracing::warn!(
308                conversion_time_ms,
309                detection_time_ms,
310                decode_time_ms,
311                resample_time_ms,
312                mix_time_ms,
313                "Audio conversion exceeded 10ms target latency"
314            );
315        } else {
316            tracing::debug!(conversion_time_ms, "Audio conversion pipeline complete");
317        }
318
319        let metadata = ConversionMetadata {
320            original_format: format_metadata.format,
321            original_sample_rate: decoded.sample_rate,
322            original_channels: decoded.channels,
323            original_bit_depth: Some(decoded.bit_depth),
324            peak_before,
325            peak_after: mixed.peak_after_mix,
326            conversion_time_ms,
327            detection_time_ms,
328            decode_time_ms,
329            resample_time_ms,
330            mix_time_ms,
331        };
332
333        Ok(StandardAudio {
334            samples: mixed.samples,
335            metadata,
336        })
337    }
338}
339
340fn elapsed_since(start: AudioInstant) -> AudioDuration {
341    AudioInstant::now().duration_since(start)
342}
343
#[cfg(test)]
mod tests {
    use super::*;

    type TestResult<T> = std::result::Result<T, String>;

    /// Encode `samples` as an in-memory 16-bit integer PCM WAV file.
    fn create_test_wav(sample_rate: u32, channels: u16, samples: &[i16]) -> TestResult<Vec<u8>> {
        let mut cursor = std::io::Cursor::new(Vec::new());

        let mut writer = hound::WavWriter::new(
            &mut cursor,
            hound::WavSpec {
                sample_rate,
                channels,
                bits_per_sample: 16,
                sample_format: hound::SampleFormat::Int,
            },
        )
        .map_err(|e| format!("failed to create WAV writer: {e}"))?;

        samples.iter().try_for_each(|&sample| {
            writer
                .write_sample(sample)
                .map_err(|e| format!("failed to write sample: {e}"))
        })?;

        writer
            .finalize()
            .map_err(|e| format!("failed to finalize WAV: {e}"))?;

        Ok(cursor.into_inner())
    }

    /// Run the pipeline and stringify any error so tests can use `?`.
    fn convert(wav_bytes: &[u8]) -> TestResult<StandardAudio> {
        AudioFormatConverter::convert_to_standard(wav_bytes).map_err(|e| e.to_string())
    }

    #[test]
    fn test_convert_mono_16khz_identity() -> TestResult<()> {
        // Input is already mono at 16kHz, with small amplitudes.
        let wav_bytes = create_test_wav(16000, 1, &[100, 200, -100, -200])?;

        let standard = convert(&wav_bytes)?;

        // Nothing to resample or mix, so all 4 samples come through.
        assert_eq!(standard.samples.len(), 4);
        assert_eq!(standard.metadata.original_sample_rate, 16000);
        assert_eq!(standard.metadata.original_channels, 1);
        assert_eq!(standard.metadata.original_format, AudioFormat::WavPcm);

        Ok(())
    }

    #[test]
    fn test_convert_stereo_44100_to_standard() -> TestResult<()> {
        // Stereo 44.1kHz → mono 16kHz; 4 interleaved values = 2 stereo frames.
        let wav_bytes = create_test_wav(44100, 2, &[1000, -1000, 2000, -2000])?;

        let standard = convert(&wav_bytes)?;

        // 2 frames at 44.1kHz last ~0.045ms. At 16kHz that is ~0.72 samples,
        // so roughly one sample is expected after resampling and mixing.
        assert!(!standard.samples.is_empty());
        assert_eq!(standard.metadata.original_sample_rate, 44100);
        assert_eq!(standard.metadata.original_channels, 2);

        Ok(())
    }

    #[test]
    fn test_convert_tracks_timing() -> TestResult<()> {
        // 1000 silent samples.
        let wav_bytes = create_test_wav(16000, 1, &[0i16; 1000])?;

        let standard = convert(&wav_bytes)?;

        // Every timing field must hold a non-negative measurement.
        assert!(standard.metadata.detection_time_ms >= 0.0);
        assert!(standard.metadata.decode_time_ms >= 0.0);
        assert!(standard.metadata.resample_time_ms >= 0.0);
        assert!(standard.metadata.mix_time_ms >= 0.0);
        assert!(standard.metadata.conversion_time_ms >= 0.0);

        // The total should roughly match the sum of the stages; the slack
        // covers work done between the timed sections.
        let stage_sum = [
            standard.metadata.detection_time_ms,
            standard.metadata.decode_time_ms,
            standard.metadata.resample_time_ms,
            standard.metadata.mix_time_ms,
        ]
        .iter()
        .fold(0.0, |total, stage_ms| total + stage_ms);

        assert!(
            (standard.metadata.conversion_time_ms - stage_sum).abs() < 1.0,
            "total time {} should approximately equal stage sum {}",
            standard.metadata.conversion_time_ms,
            stage_sum
        );

        Ok(())
    }

    #[test]
    fn test_convert_tracks_peaks() -> TestResult<()> {
        // Known peak: 10000 / 32768 ≈ 0.305.
        let wav_bytes = create_test_wav(16000, 1, &[10000, -10000, 5000, -5000])?;

        let standard = convert(&wav_bytes)?;

        // Both peaks must have been recorded.
        assert!(standard.metadata.peak_before > 0.0);
        assert!(standard.metadata.peak_after > 0.0);

        // The input peak should land near 0.305.
        assert!(
            (standard.metadata.peak_before - 0.305).abs() < 0.01,
            "expected peak ~0.305, got {}",
            standard.metadata.peak_before
        );

        Ok(())
    }

    #[test]
    fn test_convert_rejects_non_wav() {
        // Bytes that look like the start of an MP3 frame.
        let mp3_bytes = vec![0xFF, 0xFB, 0x90, 0x00];

        let result = AudioFormatConverter::convert_to_standard(&mp3_bytes);

        // Non-WAV input must not convert; the message should say why.
        assert!(result.is_err());
        if let Err(err) = result {
            let err_msg = err.to_string();
            assert!(err_msg.contains("MP3") || err_msg.contains("unsupported"));
        }
    }

    #[test]
    fn test_standard_audio_duration_calculation() -> TestResult<()> {
        // 16000 samples at 16kHz is exactly one second.
        let wav_bytes = create_test_wav(16000, 1, &[0i16; 16000])?;

        let standard = convert(&wav_bytes)?;

        assert!((standard.duration_sec() - 1.0).abs() < 0.01);

        Ok(())
    }

    #[test]
    fn test_standard_audio_is_silent_detection() -> TestResult<()> {
        // All-zero input must be reported as silent.
        let wav_bytes = create_test_wav(16000, 1, &[0i16; 100])?;

        let standard = convert(&wav_bytes)?;

        assert!(standard.is_silent());

        Ok(())
    }

    #[test]
    fn test_conversion_metadata_peak_ratio() -> TestResult<()> {
        let wav_bytes = create_test_wav(16000, 1, &[10000, -10000])?;

        let standard = convert(&wav_bytes)?;

        // Mono 16kHz input needs no mixing or resampling, so the peak should
        // be essentially unchanged.
        assert!(
            (standard.metadata.peak_ratio() - 1.0).abs() < 0.1,
            "expected peak ratio ~1.0, got {}",
            standard.metadata.peak_ratio()
        );

        Ok(())
    }
}