use super::{SpeakerEmbedding, VoiceError, VoiceResult};
/// Speaker-mapping scenario selected by a [`VoiceConversionConfig`].
///
/// Nothing in this part of the file branches on the variant; the presets only store it
/// (`autovc` and `ppg_based` pick `AnyToAny`, `stargan_vc` and `realtime` pick `ManyToOne`).
/// NOTE(review): the variant names presumably read as "source speakers"-to-"target
/// speakers" — confirm against the converters that consume this value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ConversionMode {
/// NOTE(review): presumably one fixed source speaker and one fixed target speaker — confirm.
OneToOne,
/// NOTE(review): presumably arbitrary source speakers and one fixed target speaker — confirm.
ManyToOne,
/// NOTE(review): presumably one fixed source speaker and selectable target speakers — confirm.
OneToMany,
/// Value returned by `ConversionMode::default()`.
#[default]
AnyToAny,
}
/// Kind of content bottleneck selected by a [`VoiceConversionConfig`].
///
/// Nothing in this part of the file branches on the variant; it is only stored by the
/// config and its presets.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum BottleneckType {
/// Used by the `ppg_based` preset.
/// NOTE(review): presumably "phonetic posteriorgram" features — confirm.
Ppg,
/// Value returned by `BottleneckType::default()`; also used by the `autovc`,
/// `stargan_vc` and `realtime` presets.
#[default]
AutoEncoder,
/// NOTE(review): presumably a vector-quantised (discrete) content code — confirm.
VectorQuantized,
/// NOTE(review): presumably a learned continuous content embedding — confirm.
ContentEmbedding,
}
/// Settings carried by the voice converters in this module.
///
/// [`Default`] supplies the baseline values; the preset constructors (`autovc`,
/// `stargan_vc`, `ppg_based`, `realtime`) override a few fields on top of it. Every field
/// is public and unchecked on assignment — call `validate` to check a hand-built value.
#[derive(Debug, Clone)]
pub struct VoiceConversionConfig {
/// Speaker-mapping scenario (default: `ConversionMode::AnyToAny`).
pub mode: ConversionMode,
/// Content bottleneck kind (default: `BottleneckType::AutoEncoder`).
pub bottleneck: BottleneckType,
/// Speaker dimension; `validate` rejects 0 (default: 256).
/// NOTE(review): presumably the length of a `SpeakerEmbedding` vector — confirm.
pub speaker_dim: usize,
/// Content dimension; `validate` rejects 0 (default: 512).
/// NOTE(review): presumably the per-frame content feature size — confirm.
pub content_dim: usize,
/// Sample rate; `validate` rejects 0 (default: 16000). `frame_samples` multiplies it by
/// `frame_shift_ms` and divides by 1000, i.e. treats it as samples per second.
pub sample_rate: u32,
/// Frame shift in milliseconds; `validate` rejects 0 (default: 10).
pub frame_shift_ms: u32,
/// Flag, `true` by default; not read anywhere in this part of the file.
/// NOTE(review): presumably enables fundamental-frequency (pitch) conversion — confirm.
pub convert_f0: bool,
/// Flag, `true` by default; not read anywhere in this part of the file.
/// NOTE(review): presumably enables energy conversion — confirm.
pub convert_energy: bool,
/// Pitch ratio; `validate` requires it to lie in `[0.0, 10.0]` (default: 1.0).
pub pitch_ratio: f32,
/// Formant preservation amount; `validate` requires it to lie in `[0.0, 1.0]`
/// (default: 0.5).
pub formant_preservation: f32,
}
impl Default for VoiceConversionConfig {
    /// Baseline configuration; every preset constructor fills its unspecified fields
    /// from this value via `..Self::default()`.
    ///
    /// Mode and bottleneck take their own enum defaults; the numeric fields are
    /// 256 / 512 dimensions, 16000 sample rate, 10 ms frame shift, pitch ratio 1.0 and
    /// formant preservation 0.5, with both conversion flags switched on.
    fn default() -> Self {
        // Delegate to the enums so this stays in sync with their `#[default]` variants.
        let mode = ConversionMode::default();
        let bottleneck = BottleneckType::default();

        Self {
            mode,
            bottleneck,
            speaker_dim: 256,
            content_dim: 512,
            sample_rate: 16_000,
            frame_shift_ms: 10,
            convert_f0: true,
            convert_energy: true,
            pitch_ratio: 1.0,
            formant_preservation: 0.5,
        }
    }
}
impl VoiceConversionConfig {
    /// Preset named after AutoVC: any-to-any mode with an auto-encoder bottleneck.
    ///
    /// Speaker and content dimensions are written out explicitly (256 and 512, the same
    /// numbers [`Default`] currently uses), so they do not follow later changes to the
    /// defaults. All remaining fields come from [`Default`].
    #[must_use]
    pub fn autovc() -> Self {
        Self {
            mode: ConversionMode::AnyToAny,
            bottleneck: BottleneckType::AutoEncoder,
            speaker_dim: 256,
            content_dim: 512,
            ..Self::default()
        }
    }

    /// Preset named after StarGAN-VC: many-to-one mode, auto-encoder bottleneck,
    /// 64-dimensional speaker and 256-dimensional content settings, and formant
    /// preservation set to 0.0. All remaining fields come from [`Default`].
    #[must_use]
    pub fn stargan_vc() -> Self {
        Self {
            mode: ConversionMode::ManyToOne,
            bottleneck: BottleneckType::AutoEncoder,
            speaker_dim: 64,
            content_dim: 256,
            formant_preservation: 0.0,
            ..Self::default()
        }
    }

    /// Preset using the `Ppg` bottleneck: any-to-any mode, 256-dimensional speaker and
    /// 144-dimensional content settings, formant preservation 0.3. All remaining fields
    /// come from [`Default`].
    #[must_use]
    pub fn ppg_based() -> Self {
        Self {
            mode: ConversionMode::AnyToAny,
            bottleneck: BottleneckType::Ppg,
            speaker_dim: 256,
            content_dim: 144,
            formant_preservation: 0.3,
            ..Self::default()
        }
    }

    /// Preset with a shorter frame shift (5 ms instead of the default 10 ms):
    /// many-to-one mode, auto-encoder bottleneck, formant preservation 0.7.
    /// Dimensions and all remaining fields come from [`Default`].
    #[must_use]
    pub fn realtime() -> Self {
        Self {
            mode: ConversionMode::ManyToOne,
            bottleneck: BottleneckType::AutoEncoder,
            frame_shift_ms: 5,
            formant_preservation: 0.7,
            ..Self::default()
        }
    }

    /// Checks the numeric fields and reports the first violation found.
    ///
    /// Checks run in this order: `speaker_dim`, `content_dim`, `sample_rate` and
    /// `frame_shift_ms` must be non-zero; `pitch_ratio` must lie in `[0.0, 10.0]`;
    /// `formant_preservation` must lie in `[0.0, 1.0]`. NaN fails both range checks
    /// because every comparison against NaN is false.
    ///
    /// NOTE(review): a `pitch_ratio` of exactly 0.0 is accepted — confirm that a zero
    /// ratio is meaningful to the converters.
    ///
    /// # Errors
    ///
    /// Returns [`VoiceError::InvalidConfig`] carrying a message that names the offending
    /// field.
    pub fn validate(&self) -> VoiceResult<()> {
        if self.speaker_dim == 0 {
            return Err(VoiceError::InvalidConfig(
                "speaker_dim must be > 0".to_string(),
            ));
        }
        if self.content_dim == 0 {
            return Err(VoiceError::InvalidConfig(
                "content_dim must be > 0".to_string(),
            ));
        }
        if self.sample_rate == 0 {
            return Err(VoiceError::InvalidConfig(
                "sample_rate must be > 0".to_string(),
            ));
        }
        if self.frame_shift_ms == 0 {
            return Err(VoiceError::InvalidConfig(
                "frame_shift_ms must be > 0".to_string(),
            ));
        }
        if !(0.0..=10.0).contains(&self.pitch_ratio) {
            return Err(VoiceError::InvalidConfig(
                "pitch_ratio must be in [0.0, 10.0]".to_string(),
            ));
        }
        if !(0.0..=1.0).contains(&self.formant_preservation) {
            return Err(VoiceError::InvalidConfig(
                "formant_preservation must be in [0.0, 1.0]".to_string(),
            ));
        }
        Ok(())
    }

    /// Number of samples covered by one frame shift:
    /// `sample_rate * frame_shift_ms / 1000`, rounded down.
    ///
    /// The result is 0 when the product is below 1000. If the value does not fit in
    /// `usize` (only possible on targets where `usize` is narrower than 64 bits) it
    /// saturates at `usize::MAX`.
    #[must_use]
    pub fn frame_samples(&self) -> usize {
        // Multiply in u64: the product of two u32 values always fits there, whereas a
        // `usize` product overflows on 32-bit targets (panic in debug, wrap in release).
        let samples = u64::from(self.sample_rate) * u64::from(self.frame_shift_ms) / 1000;
        if samples > usize::MAX as u64 {
            usize::MAX
        } else {
            samples as usize
        }
    }
}
/// Audio produced by a conversion together with its quality metrics.
///
/// Fields are public; only [`ConversionResult::with_metrics`] enforces the `[0.0, 1.0]`
/// range on the three scores.
#[derive(Debug, Clone)]
pub struct ConversionResult {
    /// Output samples.
    pub audio: Vec<f32>,
    /// Sample rate passed to [`ConversionResult::new`], stored unchanged.
    pub sample_rate: u32,
    /// Confidence score; 0.0 until `with_metrics` is called.
    pub confidence: f32,
    /// `audio.len() / sample_rate` as computed by `new` (0.0 when `sample_rate` is 0).
    pub duration_secs: f32,
    /// Source-similarity score; 0.0 until `with_metrics` is called.
    pub source_similarity: f32,
    /// Target-similarity score; 0.0 until `with_metrics` is called.
    pub target_similarity: f32,
}

impl ConversionResult {
    /// Wraps `audio`, deriving the duration from its length and `sample_rate`.
    ///
    /// A `sample_rate` of 0 yields a duration of 0.0 instead of dividing by zero.
    /// All three scores start at 0.0.
    #[must_use]
    pub fn new(audio: Vec<f32>, sample_rate: u32) -> Self {
        let duration_secs = if sample_rate > 0 {
            audio.len() as f32 / sample_rate as f32
        } else {
            0.0
        };
        Self {
            audio,
            sample_rate,
            confidence: 0.0,
            duration_secs,
            source_similarity: 0.0,
            target_similarity: 0.0,
        }
    }

    /// Stores the three scores, each forced into `[0.0, 1.0]`.
    ///
    /// Out-of-range values (including infinities) are clamped to the nearest bound.
    /// NaN is stored as 0.0, the lowest score, so a failed metric computation cannot
    /// slip past the range guarantee.
    #[must_use]
    pub fn with_metrics(mut self, confidence: f32, source_sim: f32, target_sim: f32) -> Self {
        // `f32::clamp` returns NaN unchanged, so NaN has to be handled before clamping.
        fn unit_score(raw: f32) -> f32 {
            if raw.is_nan() {
                0.0
            } else {
                raw.clamp(0.0, 1.0)
            }
        }

        self.confidence = unit_score(confidence);
        self.source_similarity = unit_score(source_sim);
        self.target_similarity = unit_score(target_sim);
        self
    }
}
/// Interface for voice-conversion back ends.
///
/// The `Send + Sync` supertraits require every implementor to be usable from multiple
/// threads. All methods take `&self`.
/// NOTE(review): the behavioural contract of each method is defined by the implementors,
/// which are not visible in this part of the file; the notes below describe signatures.
pub trait VoiceConverter: Send + Sync {
/// Returns a reference to the converter's configuration.
fn config(&self) -> &VoiceConversionConfig;
/// Produces a [`ConversionResult`] from `source_audio`, an optional source speaker
/// embedding and a required target speaker embedding.
/// NOTE(review): presumably `source_embedding` may be omitted when the implementor can
/// work without it — confirm per implementor.
fn convert(
&self,
source_audio: &[f32],
source_embedding: Option<&SpeakerEmbedding>,
target_embedding: &SpeakerEmbedding,
) -> VoiceResult<ConversionResult>;
/// Maps `audio` to a sequence of `f32` vectors.
/// NOTE(review): presumably one vector per frame, `content_dim` long — confirm.
fn extract_content(&self, audio: &[f32]) -> VoiceResult<Vec<Vec<f32>>>;
/// Produces `f32` samples from `content` vectors and a `speaker` embedding.
/// NOTE(review): presumably the inverse direction of `extract_content` — confirm.
fn synthesize(&self, content: &[Vec<f32>], speaker: &SpeakerEmbedding)
-> VoiceResult<Vec<f32>>;
}
/// Converter state for the AutoVC-style back end: a configuration plus a downsample
/// factor. Both fields are private; the factor is exposed through `downsample_factor()`.
#[derive(Debug, Clone)]
pub struct AutoVcConverter {
// Configuration handed to `new`, stored as given.
config: VoiceConversionConfig,
// Set to 32 by `new`.
// NOTE(review): presumably the temporal downsampling of the content bottleneck — confirm.
downsample_factor: usize,
}
impl AutoVcConverter {
    /// Builds a converter around `config` with the downsample factor fixed at 32.
    ///
    /// The configuration is stored as given; it is not validated here.
    #[must_use]
    pub fn new(config: VoiceConversionConfig) -> Self {
        let downsample_factor = 32;
        Self {
            config,
            downsample_factor,
        }
    }

    /// Builds a converter from the [`VoiceConversionConfig::autovc`] preset.
    #[must_use]
    pub fn default_autovc() -> Self {
        let preset = VoiceConversionConfig::autovc();
        Self::new(preset)
    }

    /// Returns the downsample factor chosen at construction time.
    #[must_use]
    pub fn downsample_factor(&self) -> usize {
        let Self {
            downsample_factor, ..
        } = self;
        *downsample_factor
    }
}
// Textually splice the sibling source files into this module at compile time; their
// items share this module's scope (its imports and the types declared above).
// NOTE(review): `conversion_tests.rs` is included unconditionally from here — any
// `#[cfg(test)]` gating would have to live inside that file; confirm.
include!("ppg_converter.rs");
include!("conversion_tests.rs");