Skip to main content

subx_cli/services/vad/
audio_processor.rs

1use crate::Result;
2use crate::services::vad::audio_loader::DirectAudioLoader;
3use crate::services::vad::detector::AudioInfo;
4use std::path::Path;
5
/// Audio processor for VAD operations.
///
/// Loads audio files for voice activity detection. The audio keeps its
/// original sample rate, and only the first channel is retained; no
/// resampling or channel mixing is performed.
///
/// The processor carries no state, so it is cheap to create, clone, or
/// build through `Default`.
#[derive(Debug, Clone, Default)]
pub struct VadAudioProcessor {}
12
13/// Processed audio data ready for VAD analysis.
14///
15/// Contains the audio samples and metadata after processing
16/// and format conversion.
17#[derive(Debug, Clone)]
18pub struct ProcessedAudioData {
19    /// Audio samples as 16-bit integers
20    pub samples: Vec<i16>,
21    /// Audio metadata and properties
22    pub info: AudioInfo,
23}
24
25impl VadAudioProcessor {
26    /// Create a new VAD audio processor.
27    ///
28    /// # Arguments
29    ///
30    /// * `target_sample_rate` - Desired sample rate for processing
31    /// * `target_channels` - Desired number of audio channels
32    ///
33    /// # Returns
34    ///
35    /// A new `VadAudioProcessor` instance
36    /// Create a new VAD audio processor.
37    pub fn new() -> Result<Self> {
38        Ok(Self {})
39    }
40
41    /// Load and prepare audio file for VAD processing.
42    ///
43    /// Performs all necessary audio processing steps including loading,
44    /// resampling, and format conversion to prepare the audio for
45    /// voice activity detection.
46    ///
47    /// # Arguments
48    ///
49    /// * `audio_path` - Path to the audio file to process
50    ///
51    /// # Returns
52    ///
53    /// Processed audio data ready for VAD analysis
54    ///
55    /// # Errors
56    ///
57    /// Returns an error if:
58    /// - Audio file cannot be loaded
59    /// - Audio format is unsupported
60    /// - Resampling fails
61    /// - Format conversion fails
62    ///
63    /// Directly loads and prepares audio files for VAD processing, supporting multiple formats.
64    /// Load and prepare audio file for VAD processing.
65    ///
66    /// Uses original sample rate and first channel only.
67    pub async fn load_and_prepare_audio_direct(
68        &self,
69        audio_path: &Path,
70    ) -> Result<ProcessedAudioData> {
71        // 1. Load with DirectAudioLoader in a blocking task to avoid stalling
72        //    the async runtime during synchronous decoding / filesystem access.
73        let audio_path_buf = audio_path.to_path_buf();
74        let load_result =
75            tokio::task::spawn_blocking(move || -> Result<Option<(Vec<i16>, AudioInfo)>> {
76                let loader = DirectAudioLoader::new()?;
77                // Defense-in-depth fallback limit; matches the default value
78                // of `general.max_audio_bytes`. Production callers invoking
79                // `DirectAudioLoader::load_audio_samples` directly should pass
80                // the configured value from `GeneralConfig`.
81                const DEFAULT_MAX_AUDIO_BYTES: u64 = 2_147_483_648;
82                match loader.load_audio_samples(&audio_path_buf, DEFAULT_MAX_AUDIO_BYTES) {
83                    Ok((samples, info)) => Ok(Some((samples, info))),
84                    Err(e) => {
85                        // If the file is empty, return None to signal empty samples
86                        if let Ok(metadata) = std::fs::metadata(&audio_path_buf) {
87                            if metadata.len() == 0 {
88                                return Ok(None);
89                            }
90                        }
91                        Err(e)
92                    }
93                }
94            })
95            .await
96            .map_err(|e| crate::error::SubXError::audio_processing(e.to_string()))??;
97
98        let (samples, info) = match load_result {
99            Some(v) => v,
100            None => {
101                return Ok(ProcessedAudioData {
102                    samples: vec![],
103                    info: AudioInfo {
104                        sample_rate: 16000, // Default value
105                        channels: 1,
106                        duration_seconds: 0.0,
107                        total_samples: 0,
108                    },
109                });
110            }
111        };
112
113        // 2. Extract first channel if multi-channel, retain original sample rate
114        let mono_samples = if info.channels == 1 {
115            samples
116        } else {
117            self.extract_first_channel(&samples, info.channels as usize)
118        };
119        let mono_info = AudioInfo {
120            sample_rate: info.sample_rate,
121            channels: 1,
122            duration_seconds: info.duration_seconds,
123            total_samples: mono_samples.len(),
124        };
125        Ok(ProcessedAudioData {
126            samples: mono_samples,
127            info: mono_info,
128        })
129    }
130
131    // Removed resampling and multi-channel averaging methods
132
133    /// Extract the first channel samples from interleaved multi-channel data.
134    fn extract_first_channel(&self, samples: &[i16], channels: usize) -> Vec<i16> {
135        samples.iter().step_by(channels).copied().collect()
136    }
137}