use {
std::{f32::consts::PI, time::Duration},
tokio::time::sleep,
voxudio::*,
};
/// Builds a synthetic mono test signal of `num_samples` samples.
///
/// The signal is a sine at `f0` Hz plus its second and third harmonics
/// (weights 0.5 / 0.3 / 0.2), multiplied by a 0.5 Hz amplitude envelope that
/// swings between 0 and 1, and finally scaled by 0.5.
fn synth_harmonic_tone(f0: f32, sample_rate: u32, num_samples: usize) -> Vec<f32> {
    (0..num_samples)
        .map(|i| {
            // Time of sample `i` in seconds.
            let t = i as f32 / sample_rate as f32;
            let sample = 0.5 * (2.0 * PI * f0 * t).sin()
                + 0.3 * (2.0 * PI * f0 * 2.0 * t).sin()
                + 0.2 * (2.0 * PI * f0 * 3.0 * t).sin();
            let envelope = 0.5 + 0.5 * (2.0 * PI * 0.5 * t).sin();
            sample * envelope * 0.5
        })
        .collect()
}

/// End-to-end smoke test: voice activity detection, speaker-embedding
/// extraction for a source and a target signal, then tone-color conversion.
///
/// Both inputs are synthetic harmonic tones (120 Hz and 220 Hz fundamentals),
/// so the test only checks that every stage runs without returning an error;
/// results are printed rather than asserted.
///
/// Requires the ONNX checkpoints under `../checkpoint/`.
#[tokio::test]
async fn test_complete_audio_pipeline() -> anyhow::Result<()> {
    let sample_rate: u32 = 22050;
    let duration_secs = 3.0;
    let num_samples = (sample_rate as f32 * duration_secs) as usize;

    // Same waveform shape for both signals; only the fundamental differs.
    let source_audio = synth_harmonic_tone(120.0, sample_rate, num_samples);
    let target_audio = synth_harmonic_tone(220.0, sample_rate, num_samples);

    println!("Step 1: Running voice activity detection");
    let mut vad = VoiceActivityDetector::new("../checkpoint/voice_activity_detector.onnx")?;
    let source_audio_16k = resample::<22050, 16000>(&source_audio, 1, 1)?;
    // Only the first 512 samples are fed to the detector. Three seconds of
    // audio at 16 kHz is far longer than that, so the slice cannot go out of
    // bounds here.
    // NOTE(review): 512 presumably matches the model's expected frame length
    // at 16 kHz — confirm against the detector's documentation.
    let is_speech = vad.detect::<16000>(&source_audio_16k[..512]).await?;
    println!("VAD detection result: is_speech = {}", is_speech);

    println!("Step 2: Extracting speaker embeddings from source");
    let mut see = SpeakerEmbeddingExtractor::new("../checkpoint/speaker_embedding_extractor.onnx")?;
    let source_features = see.extract(&source_audio, 1).await?;
    println!("Source feature dimensions: {}", source_features.len());

    println!("Step 3: Extracting speaker embeddings from target");
    let target_features = see.extract(&target_audio, 1).await?;
    println!("Target feature dimensions: {}", target_features.len());

    println!("Step 4: Running voice conversion model with target embeddings");
    let mut tcc = ToneColorConverter::new("../checkpoint/tone_color_converter.onnx")?;
    let converted_audio = tcc
        .convert(&source_audio, &source_features, &target_features)
        .await?;
    println!("Converted audio length: {}", converted_audio.0.len());

    println!("Integration test completed");
    Ok(())
}
/// Captures a short burst of live audio, runs voice activity detection on it
/// and, if speech is detected, extracts speaker features.
///
/// The test is skipped (returns `Ok`) when no `AudioCollector` can be created,
/// e.g. on machines without a usable capture device.
///
/// Requires the ONNX checkpoints under `../checkpoint/`.
#[tokio::test]
async fn test_realtime_audio_processing() -> anyhow::Result<()> {
    let Ok(mut collector) = AudioCollector::new() else {
        println!("Failed to create AudioCollector: skipping test");
        return Ok(());
    };
    let mut vad = VoiceActivityDetector::new("../checkpoint/voice_activity_detector.onnx")?;
    let mut see = SpeakerEmbeddingExtractor::new("../checkpoint/speaker_embedding_extractor.onnx")?;

    println!("Starting audio collector...");
    collector.collect()?;

    println!("Waiting for audio data collection...");
    // Give the collector time to buffer some audio before reading.
    sleep(Duration::from_millis(500)).await;

    // NOTE(review): `read` is called with 2 while `extract` below is called
    // with 1. If both arguments are channel counts they disagree — confirm
    // against the crate API.
    let audio_data = collector.read::<44100>(2).await?;
    println!("Read {} audio samples", audio_data.len());

    println!("Stopping audio collector...");
    collector.pause()?;

    println!("Detecting speech activity with VAD...");
    // The sample rate given to the detector must match the rate the audio was
    // read at (44100 Hz); it was previously declared as 48000 Hz.
    let is_speech = vad.detect::<44100>(&audio_data).await?;
    println!("VAD detection result: is_speech = {}", is_speech);

    if is_speech > 0.5 {
        println!("Speech detected, extracting speaker features...");
        let features = see.extract(&audio_data, 1).await?;
        println!(
            "Successfully extracted features, dimensions: {}",
            features.len()
        );
    } else {
        println!("No speech detected, skipping feature extraction");
    }
    Ok(())
}