subx_cli/services/vad/audio_processor.rs
1use crate::Result;
2use crate::services::vad::audio_loader::DirectAudioLoader;
3use crate::services::vad::detector::AudioInfo;
4use std::path::Path;
5
/// Audio processor for VAD operations.
///
/// Loads audio files for voice activity detection. Audio keeps its original
/// sample rate and only the first channel of multi-channel input is used;
/// no resampling or channel mixing is performed.
///
/// The type carries no state, so instances are cheap to create and clone.
#[derive(Debug, Clone, Default)]
pub struct VadAudioProcessor {}
12
13/// Processed audio data ready for VAD analysis.
14///
15/// Contains the audio samples and metadata after processing
16/// and format conversion.
17#[derive(Debug, Clone)]
18pub struct ProcessedAudioData {
19 /// Audio samples as 16-bit integers
20 pub samples: Vec<i16>,
21 /// Audio metadata and properties
22 pub info: AudioInfo,
23}
24
25impl VadAudioProcessor {
26 /// Create a new VAD audio processor.
27 ///
28 /// # Arguments
29 ///
30 /// * `target_sample_rate` - Desired sample rate for processing
31 /// * `target_channels` - Desired number of audio channels
32 ///
33 /// # Returns
34 ///
35 /// A new `VadAudioProcessor` instance
36 /// Create a new VAD audio processor.
37 pub fn new() -> Result<Self> {
38 Ok(Self {})
39 }
40
41 /// Load and prepare audio file for VAD processing.
42 ///
43 /// Performs all necessary audio processing steps including loading,
44 /// resampling, and format conversion to prepare the audio for
45 /// voice activity detection.
46 ///
47 /// # Arguments
48 ///
49 /// * `audio_path` - Path to the audio file to process
50 ///
51 /// # Returns
52 ///
53 /// Processed audio data ready for VAD analysis
54 ///
55 /// # Errors
56 ///
57 /// Returns an error if:
58 /// - Audio file cannot be loaded
59 /// - Audio format is unsupported
60 /// - Resampling fails
61 /// - Format conversion fails
62 ///
63 /// Directly loads and prepares audio files for VAD processing, supporting multiple formats.
64 /// Load and prepare audio file for VAD processing.
65 ///
66 /// Uses original sample rate and first channel only.
67 pub async fn load_and_prepare_audio_direct(
68 &self,
69 audio_path: &Path,
70 ) -> Result<ProcessedAudioData> {
71 // 1. Load with DirectAudioLoader in a blocking task to avoid stalling
72 // the async runtime during synchronous decoding / filesystem access.
73 let audio_path_buf = audio_path.to_path_buf();
74 let load_result =
75 tokio::task::spawn_blocking(move || -> Result<Option<(Vec<i16>, AudioInfo)>> {
76 let loader = DirectAudioLoader::new()?;
77 // Defense-in-depth fallback limit; matches the default value
78 // of `general.max_audio_bytes`. Production callers invoking
79 // `DirectAudioLoader::load_audio_samples` directly should pass
80 // the configured value from `GeneralConfig`.
81 const DEFAULT_MAX_AUDIO_BYTES: u64 = 2_147_483_648;
82 match loader.load_audio_samples(&audio_path_buf, DEFAULT_MAX_AUDIO_BYTES) {
83 Ok((samples, info)) => Ok(Some((samples, info))),
84 Err(e) => {
85 // If the file is empty, return None to signal empty samples
86 if let Ok(metadata) = std::fs::metadata(&audio_path_buf) {
87 if metadata.len() == 0 {
88 return Ok(None);
89 }
90 }
91 Err(e)
92 }
93 }
94 })
95 .await
96 .map_err(|e| crate::error::SubXError::audio_processing(e.to_string()))??;
97
98 let (samples, info) = match load_result {
99 Some(v) => v,
100 None => {
101 return Ok(ProcessedAudioData {
102 samples: vec![],
103 info: AudioInfo {
104 sample_rate: 16000, // Default value
105 channels: 1,
106 duration_seconds: 0.0,
107 total_samples: 0,
108 },
109 });
110 }
111 };
112
113 // 2. Extract first channel if multi-channel, retain original sample rate
114 let mono_samples = if info.channels == 1 {
115 samples
116 } else {
117 self.extract_first_channel(&samples, info.channels as usize)
118 };
119 let mono_info = AudioInfo {
120 sample_rate: info.sample_rate,
121 channels: 1,
122 duration_seconds: info.duration_seconds,
123 total_samples: mono_samples.len(),
124 };
125 Ok(ProcessedAudioData {
126 samples: mono_samples,
127 info: mono_info,
128 })
129 }
130
131 // Removed resampling and multi-channel averaging methods
132
133 /// Extract the first channel samples from interleaved multi-channel data.
134 fn extract_first_channel(&self, samples: &[i16], channels: usize) -> Vec<i16> {
135 samples.iter().step_by(channels).copied().collect()
136 }
137}