subx_cli/services/audio/
analyzer.rs

1//! Audio analyzer based on the aus crate.
2
3use crate::services::audio::{AudioData, AudioEnvelope};
4use crate::{Result, error::SubXError};
5use aus::{AudioFile, WindowType, analysis, operations, spectrum};
6use std::path::Path;
7
/// Audio analyzer based on aus.
///
/// Holds the analysis parameters shared by the loading, envelope and
/// spectral-feature routines implemented on this type.
#[derive(Debug, Clone)]
pub struct AusAudioAnalyzer {
    /// Sample rate reported on envelopes produced by this analyzer.
    sample_rate: u32,
    /// FFT window length, in samples, used for spectral analysis.
    window_size: usize,
    /// Step between successive analysis windows / envelope chunks, in samples.
    hop_size: usize,
}
14
15impl AusAudioAnalyzer {
16    /// Create a new analyzer and set the sample rate
17    pub fn new(sample_rate: u32) -> Self {
18        Self {
19            sample_rate,
20            window_size: 1024,
21            hop_size: 512,
22        }
23    }
24
25    /// Load audio file using aus
26    pub async fn load_audio_file<P: AsRef<Path>>(&self, audio_path: P) -> Result<AudioFile> {
27        let path = audio_path.as_ref();
28        let path_str = path
29            .to_str()
30            .ok_or_else(|| SubXError::audio_processing("Failed to convert path to UTF-8 string"))?;
31        let mut audio_file = aus::read(path_str)?;
32        if audio_file.num_channels > 1 {
33            aus::mixdown(&mut audio_file);
34        }
35
36        // Fix duration calculation issue
37        if audio_file.duration == 0.0 && !audio_file.samples[0].is_empty() {
38            audio_file.duration =
39                audio_file.samples[0].len() as f64 / audio_file.sample_rate as f64;
40        }
41
42        Ok(audio_file)
43    }
44
45    /// Load audio file and convert to AudioData format
46    pub async fn load_audio_data<P: AsRef<Path>>(&self, audio_path: P) -> Result<AudioData> {
47        let audio_file = self.load_audio_file(audio_path).await?;
48        let samples: Vec<f32> = audio_file.samples[0].iter().map(|&x| x as f32).collect();
49        Ok(AudioData {
50            samples,
51            sample_rate: audio_file.sample_rate,
52            channels: audio_file.num_channels,
53            duration: audio_file.duration as f32,
54        })
55    }
56
57    /// Extract audio energy envelope
58    pub async fn extract_envelope<P: AsRef<Path>>(&self, audio_path: P) -> Result<AudioEnvelope> {
59        let audio_file = self.load_audio_file(audio_path).await?;
60        let samples = &audio_file.samples[0];
61        let mut energy_samples = Vec::new();
62        for chunk in samples.chunks(self.hop_size) {
63            let rms_energy = operations::rms(chunk);
64            energy_samples.push(rms_energy as f32);
65        }
66
67        // Ensure duration is correct
68        let duration = if audio_file.duration > 0.0 {
69            audio_file.duration as f32
70        } else {
71            samples.len() as f32 / audio_file.sample_rate as f32
72        };
73
74        Ok(AudioEnvelope {
75            samples: energy_samples,
76            sample_rate: self.sample_rate,
77            duration,
78        })
79    }
80
81    /// Detect dialogue segments (legacy interface compatible)
82    pub fn detect_dialogue(
83        &self,
84        envelope: &AudioEnvelope,
85        threshold: f32,
86    ) -> Vec<crate::services::audio::DialogueSegment> {
87        let mut segments = Vec::new();
88        let mut in_dialogue = false;
89        let mut start = 0.0;
90        let time_per_sample = envelope.duration / envelope.samples.len() as f32;
91
92        for (i, &e) in envelope.samples.iter().enumerate() {
93            let t = i as f32 * time_per_sample;
94            if e > threshold && !in_dialogue {
95                in_dialogue = true;
96                start = t;
97            } else if e <= threshold && in_dialogue {
98                in_dialogue = false;
99                if t - start > 0.5 {
100                    segments.push(crate::services::audio::DialogueSegment {
101                        start_time: start,
102                        end_time: t,
103                        intensity: e,
104                    });
105                }
106            }
107        }
108
109        segments
110    }
111
112    /// Audio feature analysis using aus
113    pub async fn analyze_audio_features(&self, audio_file: &AudioFile) -> Result<AudioFeatures> {
114        let samples = &audio_file.samples[0];
115        let stft_result = spectrum::rstft(
116            samples,
117            self.window_size,
118            self.hop_size,
119            WindowType::Hanning,
120        );
121
122        let mut features = Vec::new();
123        for frame in stft_result.iter() {
124            let (magnitude_spectrum, _) = spectrum::complex_to_polar_rfft(frame);
125            let frequencies = spectrum::rfftfreq(self.window_size, audio_file.sample_rate);
126
127            let spectral_centroid = analysis::spectral_centroid(&magnitude_spectrum, &frequencies);
128            let spectral_entropy = analysis::spectral_entropy(&magnitude_spectrum);
129            let zero_crossing_rate = analysis::zero_crossing_rate(samples, audio_file.sample_rate);
130
131            features.push(FrameFeatures {
132                spectral_centroid: spectral_centroid as f32,
133                spectral_entropy: spectral_entropy as f32,
134                zero_crossing_rate: zero_crossing_rate as f32,
135            });
136        }
137
138        Ok(AudioFeatures { frames: features })
139    }
140}
141
142/// Audio feature data structure containing extracted characteristics.
143///
144/// Contains frame-by-frame audio features extracted from audio analysis,
145/// used for dialogue detection and subtitle synchronization.
146#[derive(Debug, Clone)]
147pub struct AudioFeatures {
148    /// Vector of feature data for each audio frame
149    pub frames: Vec<FrameFeatures>,
150}
151
/// Features describing one analysis frame.
///
/// Each value characterizes a short window of the audio signal.
#[derive(Clone, Debug)]
pub struct FrameFeatures {
    /// Spectral centroid, a measure of how "bright" the sound is.
    pub spectral_centroid: f32,
    /// Spectral entropy, a measure of randomness in the frequency domain.
    pub spectral_entropy: f32,
    /// Zero crossing rate, an indicator of how noisy the signal is.
    pub zero_crossing_rate: f32,
}
165
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Test audio file loading functionality
    #[ignore]
    #[tokio::test]
    async fn test_load_audio_file_success() {
        let analyzer = AusAudioAnalyzer::new(44100);
        let temp_dir = TempDir::new().unwrap();
        // Create mock WAV file (minimal valid WAV header)
        let wav_data = create_minimal_wav_file(44100, 1, 1.0);
        let wav_path = temp_dir.path().join("test.wav");
        fs::write(&wav_path, wav_data).unwrap();

        let result = analyzer.load_audio_file(&wav_path).await;
        assert!(result.is_ok());

        let audio_file = result.unwrap();
        assert_eq!(audio_file.sample_rate, 44100);
        assert!(audio_file.duration > 0.0);
        assert_eq!(audio_file.num_channels, 1);
    }

    /// Test error handling for non-existent files
    #[ignore]
    #[tokio::test]
    async fn test_load_audio_file_not_exists() {
        let analyzer = AusAudioAnalyzer::new(44100);
        let result = analyzer.load_audio_file("non_existent.wav").await;
        assert!(result.is_err());
    }

    /// Test audio data format conversion
    #[ignore]
    #[tokio::test]
    async fn test_load_audio_data_conversion() {
        let analyzer = AusAudioAnalyzer::new(16000);
        let temp_dir = TempDir::new().unwrap();

        let wav_data = create_minimal_wav_file(16000, 1, 2.0);
        let wav_path = temp_dir.path().join("test.wav");
        fs::write(&wav_path, wav_data).unwrap();

        let audio_data = analyzer.load_audio_data(&wav_path).await.unwrap();

        assert_eq!(audio_data.sample_rate, 16000);
        assert_eq!(audio_data.channels, 1);
        assert!(audio_data.duration > 1.9 && audio_data.duration < 2.1);
        assert!(!audio_data.samples.is_empty());
    }

    /// Test audio energy envelope extraction
    #[ignore]
    #[tokio::test]
    async fn test_extract_envelope_features() {
        let sample_rate = 44100;
        let analyzer = AusAudioAnalyzer::new(sample_rate);
        let temp_dir = TempDir::new().unwrap();

        // Create audio file with varying energy levels
        let wav_data = create_varying_energy_wav(44100, 2.0);
        let wav_path = temp_dir.path().join("varying.wav");
        fs::write(&wav_path, wav_data).unwrap();

        let envelope = analyzer.extract_envelope(&wav_path).await.unwrap();

        assert!(!envelope.samples.is_empty());
        assert_eq!(envelope.sample_rate, sample_rate);
        assert!(envelope.duration > 1.9);

        // Verify energy values are within reasonable range
        for &energy in &envelope.samples {
            assert!(energy >= 0.0);
            assert!(energy <= 1.0);
        }
    }

    /// Test dialogue detection functionality
    ///
    /// NOTE(review): with 14 envelope samples spread over 2.0 seconds the first
    /// burst spans less than the 0.5 second minimum applied by
    /// `detect_dialogue`; confirm the expected segment count before enabling
    /// this test.
    #[ignore]
    #[tokio::test]
    async fn test_detect_dialogue_segments() {
        let analyzer = AusAudioAnalyzer::new(16000);

        // Create mock audio envelope (containing speech and silence segments)
        let envelope = AudioEnvelope {
            samples: vec![
                0.1, 0.8, 0.9, 0.7, 0.2, // Speech segment
                0.05, 0.03, 0.02, 0.04, // Silence segment
                0.6, 0.8, 0.7, 0.9, 0.5, // Speech segment
            ],
            sample_rate: 16000,
            duration: 2.0,
        };

        let segments = analyzer.detect_dialogue(&envelope, 0.3);

        assert!(!segments.is_empty());

        // Verify detected speech segments
        let speech_segments: Vec<_> = segments.iter().filter(|s| s.intensity > 0.3).collect();
        assert!(speech_segments.len() >= 2);
    }

    /// Test audio feature analysis
    #[ignore]
    #[tokio::test]
    async fn test_audio_features_analysis() {
        let analyzer = AusAudioAnalyzer::new(44100);
        let temp_dir = TempDir::new().unwrap();

        let wav_data = create_spectral_rich_wav(44100, 1.0);
        let wav_path = temp_dir.path().join("rich.wav");
        fs::write(&wav_path, wav_data).unwrap();

        let audio_file = analyzer.load_audio_file(&wav_path).await.unwrap();
        let features = analyzer.analyze_audio_features(&audio_file).await.unwrap();

        assert!(!features.frames.is_empty());

        for frame in &features.frames {
            // Verify spectral centroid is within reasonable range (0 to Nyquist frequency)
            assert!(frame.spectral_centroid >= 0.0);
            assert!(frame.spectral_centroid <= 22050.0);

            // Verify spectral entropy
            assert!(frame.spectral_entropy >= 0.0);
            assert!(frame.spectral_entropy <= 1.0);

            // Verify zero crossing rate
            assert!(frame.zero_crossing_rate >= 0.0);
            assert!(frame.zero_crossing_rate <= 1.0);
        }
    }

    /// Test invalid audio format handling
    #[ignore]
    #[tokio::test]
    async fn test_invalid_audio_format() {
        let analyzer = AusAudioAnalyzer::new(44100);
        let temp_dir = TempDir::new().unwrap();

        // Create invalid audio file
        let invalid_path = temp_dir.path().join("invalid.wav");
        fs::write(&invalid_path, b"This is not audio data").unwrap();

        let result = analyzer.load_audio_file(&invalid_path).await;
        assert!(result.is_err());
    }

    /// Test large file processing and memory management
    #[ignore]
    #[tokio::test]
    async fn test_large_file_memory_management() {
        let analyzer = AusAudioAnalyzer::new(44100);
        let temp_dir = TempDir::new().unwrap();

        // Create larger audio file (10 seconds)
        let wav_data = create_minimal_wav_file(44100, 1, 10.0);
        let wav_path = temp_dir.path().join("large.wav");
        fs::write(&wav_path, wav_data).unwrap();

        let start_memory = get_memory_usage();
        let _audio_data = analyzer.load_audio_data(&wav_path).await.unwrap();
        let end_memory = get_memory_usage();

        // Verify memory usage is within reasonable range (< 100MB growth).
        // Saturating subtraction keeps the check from underflowing if the
        // measured usage shrinks between the two readings.
        assert!(end_memory.saturating_sub(start_memory) < 100_000_000);
    }

    // Helper functions for creating test audio files

    /// Build an in-memory 16-bit PCM WAV file containing a 440 Hz sine tone.
    ///
    /// Every channel of a frame carries the same sample value.
    fn create_minimal_wav_file(sample_rate: u32, channels: u16, duration: f32) -> Vec<u8> {
        let samples_per_channel = (sample_rate as f32 * duration) as u32;
        let total_samples = samples_per_channel * channels as u32;
        let data_size = total_samples * 2; // 16-bit samples
        let mut wav_data = Vec::with_capacity(44 + data_size as usize);
        // WAV header
        wav_data.extend_from_slice(b"RIFF");
        wav_data.extend_from_slice(&(36 + data_size).to_le_bytes());
        wav_data.extend_from_slice(b"WAVE");
        wav_data.extend_from_slice(b"fmt ");
        wav_data.extend_from_slice(&16u32.to_le_bytes());
        wav_data.extend_from_slice(&1u16.to_le_bytes()); // PCM
        wav_data.extend_from_slice(&channels.to_le_bytes());
        wav_data.extend_from_slice(&sample_rate.to_le_bytes());
        wav_data.extend_from_slice(&(sample_rate * channels as u32 * 2).to_le_bytes());
        wav_data.extend_from_slice(&(channels * 2).to_le_bytes());
        wav_data.extend_from_slice(&16u16.to_le_bytes());
        wav_data.extend_from_slice(b"data");
        wav_data.extend_from_slice(&data_size.to_le_bytes());
        // Audio data (simple sine wave). Samples are interleaved, so the time
        // of a sample comes from its frame index, not its raw sample index.
        for i in 0..total_samples {
            let frame = i / channels as u32;
            let t = frame as f32 / sample_rate as f32;
            let amplitude = (2.0 * std::f32::consts::PI * 440.0 * t).sin();
            let sample = (amplitude * 32767.0) as i16;
            wav_data.extend_from_slice(&sample.to_le_bytes());
        }
        wav_data
    }

    /// Placeholder for a varying-energy fixture; currently a plain mono sine tone.
    fn create_varying_energy_wav(sample_rate: u32, duration: f32) -> Vec<u8> {
        // Implementation for creating audio file with varying energy
        create_minimal_wav_file(sample_rate, 1, duration)
    }

    /// Placeholder for a spectrally rich fixture; currently a plain mono sine tone.
    fn create_spectral_rich_wav(sample_rate: u32, duration: f32) -> Vec<u8> {
        // Implementation for creating spectrally rich audio file
        create_minimal_wav_file(sample_rate, 1, duration)
    }

    /// Stub memory probe that always reports zero bytes in use.
    fn get_memory_usage() -> usize {
        // Simplified memory usage detection
        0 // Actual implementation could use procfs or other system tools
    }
}