use super::{SpeakerEmbedding, StyleVector, VoiceError, VoiceResult};
/// Tunable parameters for voice cloning.
///
/// Presets are provided by `CloningConfig::few_shot`, `CloningConfig::zero_shot`,
/// `CloningConfig::with_adaptation` and the `Default` impl; `CloningConfig::validate`
/// checks the numeric fields.
#[derive(Debug, Clone)]
pub struct CloningConfig {
/// Lower bound on reference audio length. Multiplied by `sample_rate` in
/// `min_reference_samples` (presumably seconds — confirm). Must be > 0 to pass `validate`.
pub min_reference_duration: f32,
/// Upper bound on reference audio length, same unit as `min_reference_duration`.
/// Must be strictly greater than `min_reference_duration` to pass `validate`.
pub max_reference_duration: f32,
/// Audio sample rate (presumably samples per second — confirm). Must be non-zero.
pub sample_rate: u32,
/// Speaker-embedding dimensionality. Must be non-zero.
pub embedding_dim: usize,
/// Number of mel bands (presumably for mel-spectrogram features — confirm). Must be non-zero.
pub n_mels: usize,
/// Adaptation switch; only the `with_adaptation` preset turns it on.
pub enable_adaptation: bool,
/// Similarity threshold; `validate` requires it to lie in `[0.0, 1.0]`.
/// NOTE(review): where the threshold is consumed is not visible in this file section.
pub similarity_threshold: f32,
}
impl Default for CloningConfig {
fn default() -> Self {
Self {
min_reference_duration: 3.0,
max_reference_duration: 30.0,
sample_rate: 22050,
embedding_dim: 256,
n_mels: 80,
enable_adaptation: false,
similarity_threshold: 0.75,
}
}
}
impl CloningConfig {
    /// Preset with a 3.0–10.0 reference window and adaptation disabled.
    ///
    /// Every other field is taken from [`Default`].
    #[must_use]
    pub fn few_shot() -> Self {
        Self {
            min_reference_duration: 3.0,
            max_reference_duration: 10.0,
            enable_adaptation: false,
            ..Self::default()
        }
    }

    /// Preset with a 1.0–5.0 reference window and adaptation disabled.
    ///
    /// Every other field is taken from [`Default`].
    #[must_use]
    pub fn zero_shot() -> Self {
        Self {
            min_reference_duration: 1.0,
            max_reference_duration: 5.0,
            enable_adaptation: false,
            ..Self::default()
        }
    }

    /// Preset with adaptation enabled and a 10.0–60.0 reference window.
    ///
    /// Every other field is taken from [`Default`].
    #[must_use]
    pub fn with_adaptation() -> Self {
        Self {
            enable_adaptation: true,
            min_reference_duration: 10.0,
            max_reference_duration: 60.0,
            ..Self::default()
        }
    }

    /// Checks that every numeric field is usable.
    ///
    /// # Errors
    ///
    /// Returns [`VoiceError::InvalidConfig`] for the first rule that fails:
    /// - `min_reference_duration` is NaN or not greater than zero;
    /// - `max_reference_duration` is NaN or not greater than `min_reference_duration`;
    /// - `sample_rate`, `embedding_dim` or `n_mels` is zero;
    /// - `similarity_threshold` is outside `[0.0, 1.0]` (NaN included).
    pub fn validate(&self) -> VoiceResult<()> {
        let invalid =
            |msg: &str| -> VoiceResult<()> { Err(VoiceError::InvalidConfig(msg.to_string())) };

        // Every ordered comparison against NaN is false, so a plain `<=` test
        // would wave NaN through; reject it explicitly.
        if self.min_reference_duration.is_nan() || self.min_reference_duration <= 0.0 {
            return invalid("min_reference_duration must be > 0");
        }
        if self.max_reference_duration.is_nan()
            || self.max_reference_duration <= self.min_reference_duration
        {
            return invalid("max_reference_duration must be > min_reference_duration");
        }
        if self.sample_rate == 0 {
            return invalid("sample_rate must be > 0");
        }
        if self.embedding_dim == 0 {
            return invalid("embedding_dim must be > 0");
        }
        if self.n_mels == 0 {
            return invalid("n_mels must be > 0");
        }
        // `contains` is false for NaN, so NaN is rejected here as well.
        if !(0.0..=1.0).contains(&self.similarity_threshold) {
            return invalid("similarity_threshold must be in [0.0, 1.0]");
        }
        Ok(())
    }

    /// Number of samples spanned by `min_reference_duration` at `sample_rate`.
    ///
    /// The product is computed in `f32` and converted with `as`, which truncates
    /// toward zero and saturates at the bounds of `usize`.
    #[must_use]
    pub fn min_reference_samples(&self) -> usize {
        (self.min_reference_duration * self.sample_rate as f32) as usize
    }

    /// Number of samples spanned by `max_reference_duration` at `sample_rate`.
    ///
    /// The product is computed in `f32` and converted with `as`, which truncates
    /// toward zero and saturates at the bounds of `usize`.
    #[must_use]
    pub fn max_reference_samples(&self) -> usize {
        (self.max_reference_duration * self.sample_rate as f32) as usize
    }
}
/// Per-speaker cloning state: an identifier, an optional embedding and style,
/// plus bookkeeping metadata. All fields are private and reached through the
/// accessor methods on `VoiceProfile`.
#[derive(Debug, Clone)]
pub struct VoiceProfile {
/// Caller-supplied speaker identifier.
speaker_id: String,
/// Speaker embedding; `None` until one is supplied. `is_ready` reports whether it is set.
embedding: Option<SpeakerEmbedding>,
/// Style vector; `None` until `set_style` is called.
style: Option<StyleVector>,
/// Reference audio length as recorded through `set_reference_duration`
/// (presumably seconds — confirm). Starts at 0.0.
reference_duration: f32,
/// Quality score; `set_quality_score` clamps incoming values to `[0.0, 1.0]`.
quality_score: f32,
/// Flag toggled through `set_adapted`; starts as `false`.
adapted: bool,
}
impl VoiceProfile {
    /// Creates an empty profile for `speaker_id`: no embedding, no style, zero
    /// reference duration, zero quality score and not adapted.
    #[must_use]
    pub fn new(speaker_id: String) -> Self {
        Self {
            speaker_id,
            embedding: None,
            style: None,
            reference_duration: 0.0,
            quality_score: 0.0,
            adapted: false,
        }
    }

    /// Creates a profile that already carries `embedding`.
    ///
    /// The quality score starts at 0.5; every other field matches [`VoiceProfile::new`].
    #[must_use]
    pub fn with_embedding(speaker_id: String, embedding: SpeakerEmbedding) -> Self {
        Self {
            embedding: Some(embedding),
            quality_score: 0.5,
            ..Self::new(speaker_id)
        }
    }

    /// Identifier this profile was created with.
    #[must_use]
    pub fn speaker_id(&self) -> &str {
        &self.speaker_id
    }

    /// Borrowed speaker embedding, if one has been set.
    #[must_use]
    pub fn embedding(&self) -> Option<&SpeakerEmbedding> {
        self.embedding.as_ref()
    }

    /// Borrowed style vector, if one has been set.
    #[must_use]
    pub fn style(&self) -> Option<&StyleVector> {
        self.style.as_ref()
    }

    /// Stores `embedding`, replacing any previous one.
    pub fn set_embedding(&mut self, embedding: SpeakerEmbedding) {
        self.embedding = Some(embedding);
    }

    /// Stores `style`, replacing any previous one.
    pub fn set_style(&mut self, style: StyleVector) {
        self.style = Some(style);
    }

    /// Recorded reference audio duration.
    #[must_use]
    pub fn reference_duration(&self) -> f32 {
        self.reference_duration
    }

    /// Records the reference audio duration as given (no validation is applied).
    pub fn set_reference_duration(&mut self, duration: f32) {
        self.reference_duration = duration;
    }

    /// Current quality score, in `[0.0, 1.0]`.
    #[must_use]
    pub fn quality_score(&self) -> f32 {
        self.quality_score
    }

    /// Stores `score` clamped to `[0.0, 1.0]`; a NaN score is stored as 0.0.
    pub fn set_quality_score(&mut self, score: f32) {
        // `f32::clamp` returns NaN for a NaN receiver, which would break the
        // [0, 1] invariant, so map NaN to the lowest score instead.
        self.quality_score = if score.is_nan() {
            0.0
        } else {
            score.clamp(0.0, 1.0)
        };
    }

    /// Whether the profile has been marked as adapted.
    #[must_use]
    pub fn is_adapted(&self) -> bool {
        self.adapted
    }

    /// Marks or unmarks the profile as adapted.
    pub fn set_adapted(&mut self, adapted: bool) {
        self.adapted = adapted;
    }

    /// `true` once an embedding is present.
    #[must_use]
    pub fn is_ready(&self) -> bool {
        self.embedding.is_some()
    }

    /// Cosine similarity between this profile's embedding and `other`'s.
    ///
    /// The result is always in `[-1.0, 1.0]`. It is 0.0 when either embedding
    /// has a (near-)zero norm or when the computation is undefined because of
    /// non-finite values.
    ///
    /// # Errors
    ///
    /// - [`VoiceError::InvalidConfig`] if either profile has no embedding.
    /// - [`VoiceError::DimensionMismatch`] if the embeddings differ in dimension.
    pub fn similarity(&self, other: &Self) -> VoiceResult<f32> {
        let emb_a = self
            .embedding
            .as_ref()
            .ok_or_else(|| VoiceError::InvalidConfig("missing embedding in source".to_string()))?;
        let emb_b = other
            .embedding
            .as_ref()
            .ok_or_else(|| VoiceError::InvalidConfig("missing embedding in target".to_string()))?;

        if emb_a.dim() != emb_b.dim() {
            return Err(VoiceError::DimensionMismatch {
                expected: emb_a.dim(),
                got: emb_b.dim(),
            });
        }

        let dot: f32 = emb_a
            .as_slice()
            .iter()
            .zip(emb_b.as_slice().iter())
            .map(|(a, b)| a * b)
            .sum();
        let norm_a = emb_a.l2_norm();
        let norm_b = emb_b.l2_norm();

        // A zero-length vector has no direction; report "no similarity".
        if norm_a < f32::EPSILON || norm_b < f32::EPSILON {
            return Ok(0.0);
        }

        let cosine = dot / (norm_a * norm_b);
        // NaN slips past both the norm guard (comparisons with NaN are false)
        // and `clamp`; treat it like the degenerate zero-norm case so callers
        // never receive a value outside [-1, 1].
        if cosine.is_nan() {
            return Ok(0.0);
        }
        Ok(cosine.clamp(-1.0, 1.0))
    }
}
/// Interface for a voice-cloning backend: builds profiles from reference
/// audio, synthesizes audio for a profile, and optionally adapts a profile.
pub trait VoiceCloner {
/// Builds a [`VoiceProfile`] for `speaker_id` from `reference_audio`.
///
/// NOTE(review): the sample format is not fixed by this trait; presumably mono
/// samples at the rate given by `config().sample_rate` — confirm with implementors.
fn create_profile(
&self,
reference_audio: &[f32],
speaker_id: &str,
) -> VoiceResult<VoiceProfile>;
/// Produces audio samples for `text` using `profile`.
fn synthesize(&self, text: &str, profile: &VoiceProfile) -> VoiceResult<Vec<f32>>;
/// Updates `profile` in place using `additional_audio`.
fn adapt(&self, profile: &mut VoiceProfile, additional_audio: &[f32]) -> VoiceResult<()>;
/// The configuration this cloner operates with.
fn config(&self) -> &CloningConfig;
}
/// Interface for turning raw audio into a [`SpeakerEmbedding`].
pub trait SpeakerEncoder {
/// Encodes `audio` into a speaker embedding, or reports why it could not.
fn encode(&self, audio: &[f32]) -> VoiceResult<SpeakerEmbedding>;
/// Dimensionality reported by this encoder (presumably the dimension of the
/// embeddings `encode` returns — confirm with implementors).
fn embedding_dim(&self) -> usize;
}
/// Cloner that holds a [`CloningConfig`].
///
/// NOTE(review): the name suggests a YourTTS-style backend; its trait
/// implementations are not visible in this part of the file.
#[derive(Debug)]
pub struct YourTtsCloner {
/// Configuration supplied at construction time.
config: CloningConfig,
}
impl YourTtsCloner {
#[must_use]
pub fn new(config: CloningConfig) -> Self {
Self { config }
}
#[must_use]
pub fn default_config() -> Self {
Self::new(CloningConfig::default())
}
}
// `include!` splices the named file into this module at compile time; the path
// is resolved relative to this source file, and the included items share this
// module's scope (including the `use super::…` imports above).
// NOTE(review): presumably holds the `SpeakerEncoder` implementation(s) — confirm.
include!("sv2tts_speaker_encoder.rs");
// NOTE(review): presumably this module's unit tests; confirm the included file
// gates itself with `#[cfg(test)]` so test code stays out of non-test builds.
include!("clone_tests.rs");