impl VoiceCloner for YourTtsCloner {
    /// Validates the reference audio for building a voice profile.
    ///
    /// This backend is currently a stub: once validation passes it always
    /// reports `VoiceError::NotImplemented`.
    ///
    /// # Errors
    ///
    /// * `VoiceError::InvalidAudio` when `reference_audio` is empty or holds
    ///   fewer samples than the configured minimum.
    /// * `VoiceError::NotImplemented` in every other case.
    fn create_profile(
        &self,
        reference_audio: &[f32],
        _speaker_id: &str,
    ) -> VoiceResult<VoiceProfile> {
        let sample_count = reference_audio.len();
        if sample_count == 0 {
            return Err(VoiceError::InvalidAudio(String::from(
                "empty reference audio",
            )));
        }

        // The minimum is only queried once the audio is known to be non-empty.
        let required = self.config.min_reference_samples();
        if sample_count < required {
            let message = format!(
                "reference audio too short: {} samples, need at least {}",
                sample_count, required
            );
            return Err(VoiceError::InvalidAudio(message));
        }

        Err(VoiceError::NotImplemented(String::from(
            "YourTTS requires model weights",
        )))
    }

    /// Validates a synthesis request for `text` using `profile`.
    ///
    /// # Errors
    ///
    /// * `VoiceError::InvalidConfig` when `text` is empty.
    /// * `VoiceError::ModelNotLoaded` when the profile is not ready.
    /// * `VoiceError::NotImplemented` otherwise (stub backend).
    fn synthesize(&self, text: &str, profile: &VoiceProfile) -> VoiceResult<Vec<f32>> {
        if text.is_empty() {
            return Err(VoiceError::InvalidConfig(String::from("empty text")));
        }

        if profile.is_ready() {
            Err(VoiceError::NotImplemented(String::from(
                "YourTTS requires model weights",
            )))
        } else {
            Err(VoiceError::ModelNotLoaded)
        }
    }

    /// Validates a request to adapt `profile` with `additional_audio`.
    ///
    /// The profile is never modified by this stub.
    ///
    /// # Errors
    ///
    /// * `VoiceError::InvalidAudio` when `additional_audio` is empty.
    /// * `VoiceError::InvalidConfig` when adaptation is disabled in the config.
    /// * `VoiceError::NotImplemented` otherwise (stub backend).
    fn adapt(&self, profile: &mut VoiceProfile, additional_audio: &[f32]) -> VoiceResult<()> {
        if additional_audio.is_empty() {
            return Err(VoiceError::InvalidAudio(String::from("empty audio")));
        }

        let adaptation_allowed = self.config.enable_adaptation;
        if !adaptation_allowed {
            return Err(VoiceError::InvalidConfig(String::from(
                "adaptation not enabled in config",
            )));
        }

        // Explicitly discard the profile: nothing can be adapted without a model.
        let _ = profile;
        Err(VoiceError::NotImplemented(String::from(
            "YourTTS adaptation requires model weights",
        )))
    }

    /// Returns the cloning configuration held by this cloner.
    fn config(&self) -> &CloningConfig {
        &self.config
    }
}
/// Speaker encoder placeholder for the SV2TTS architecture.
///
/// Only the embedding dimensionality is stored.
#[derive(Debug)]
pub struct Sv2TtsSpeakerEncoder {
    /// Dimensionality this encoder reports for its embeddings.
    embedding_dim: usize,
}

impl Sv2TtsSpeakerEncoder {
    /// Embedding dimensionality used by [`Sv2TtsSpeakerEncoder::default_config`].
    const DEFAULT_EMBEDDING_DIM: usize = 256;

    /// Creates an encoder that reports `embedding_dim` as its embedding size.
    #[must_use]
    pub fn new(embedding_dim: usize) -> Self {
        Sv2TtsSpeakerEncoder { embedding_dim }
    }

    /// Creates an encoder with the default embedding dimension of 256.
    #[must_use]
    pub fn default_config() -> Self {
        Sv2TtsSpeakerEncoder {
            embedding_dim: Self::DEFAULT_EMBEDDING_DIM,
        }
    }
}
impl SpeakerEncoder for Sv2TtsSpeakerEncoder {
    /// Validates `audio` for speaker-embedding extraction.
    ///
    /// This encoder is a stub and never produces an embedding.
    ///
    /// # Errors
    ///
    /// * `VoiceError::InvalidAudio` when `audio` is empty.
    /// * `VoiceError::NotImplemented` for any non-empty input.
    fn encode(&self, audio: &[f32]) -> VoiceResult<SpeakerEmbedding> {
        let failure = if audio.is_empty() {
            VoiceError::InvalidAudio(String::from("empty audio"))
        } else {
            VoiceError::NotImplemented(String::from("SV2TTS requires model weights"))
        };
        Err(failure)
    }

    /// Returns the embedding dimensionality this encoder was constructed with.
    fn embedding_dim(&self) -> usize {
        self.embedding_dim
    }
}
/// Decides whether two voice profiles belong to the same speaker.
///
/// Returns `Ok(true)` when the similarity between `profile_a` and
/// `profile_b` is greater than or equal to `threshold`.
///
/// # Errors
///
/// Propagates any error returned by the similarity computation between the
/// two profiles.
pub fn verify_same_speaker(
    profile_a: &VoiceProfile,
    profile_b: &VoiceProfile,
    threshold: f32,
) -> VoiceResult<bool> {
    let score = profile_a.similarity(profile_b)?;
    let meets_threshold = score >= threshold;
    Ok(meets_threshold)
}
/// Heuristically scores how suitable `audio` is as voice-cloning reference material.
///
/// The score combines three components:
///
/// * **duration** – ramps up linearly until 3 s, is `1.0` between 3 s and
///   30 s, and decays as `30 / duration` beyond that;
/// * **energy** – overall RMS scaled by 10 and capped at `1.0`;
/// * **activity** – the fraction of 10 ms frames (50 % overlap) whose RMS
///   exceeds 10 % of the overall RMS.
///
/// When frames can be analysed the result is
/// `0.3 * duration + 0.3 * energy + 0.4 * activity`. When they cannot — the
/// clip is shorter than one frame plus one hop, or `sample_rate` is below
/// 200 Hz so that the hop length rounds to zero — the result falls back to
/// `duration * energy`.
///
/// Returns `0.0` for empty audio or a zero sample rate. Never panics.
#[must_use]
pub fn estimate_quality(audio: &[f32], sample_rate: u32) -> f32 {
    /// Root-mean-square amplitude of a non-empty run of samples.
    fn root_mean_square(samples: &[f32]) -> f32 {
        (samples.iter().map(|x| x * x).sum::<f32>() / samples.len() as f32).sqrt()
    }

    if audio.is_empty() || sample_rate == 0 {
        return 0.0;
    }

    let duration = audio.len() as f32 / sample_rate as f32;
    let duration_score = if duration < 3.0 {
        duration / 3.0
    } else if duration > 30.0 {
        30.0 / duration
    } else {
        1.0
    };

    let rms = root_mean_square(audio);
    let energy_score = (rms * 10.0).min(1.0);

    // 10 ms analysis frames advanced by half a frame.
    let frame_size = sample_rate as usize / 100;
    let hop = frame_size / 2;
    // Below 200 Hz the hop rounds down to zero; dividing by it would panic,
    // so treat that case like a clip that is too short to frame.
    let frame_count = if hop == 0 {
        0
    } else {
        audio.len().saturating_sub(frame_size) / hop
    };
    if frame_count == 0 {
        return duration_score * energy_score;
    }

    // A frame counts as active when it carries more than 10 % of the overall RMS.
    let threshold = rms * 0.1;
    let active_frames = audio
        .windows(frame_size)
        .step_by(hop)
        .take(frame_count)
        .filter(|frame| root_mean_square(frame) > threshold)
        .count();
    let activity_score = active_frames as f32 / frame_count as f32;

    duration_score * 0.3 + energy_score * 0.3 + activity_score * 0.4
}
/// Combines several voice profiles into a single averaged profile.
///
/// The merged profile:
///
/// * takes its speaker id from the first profile in `profiles`;
/// * holds the element-wise mean of every embedding that is present;
/// * sums the reference durations and averages the quality scores of **all**
///   input profiles, including those without an embedding;
/// * is flagged as adapted when any input profile is adapted.
///
/// # Errors
///
/// * `VoiceError::InvalidConfig` when `profiles` is empty or none of the
///   profiles carries an embedding.
/// * `VoiceError::DimensionMismatch` when an embedding's dimension differs
///   from that of the first embedding found.
pub fn merge_profiles(profiles: &[VoiceProfile]) -> VoiceResult<VoiceProfile> {
    if profiles.is_empty() {
        return Err(VoiceError::InvalidConfig(String::from(
            "cannot merge empty profile list",
        )));
    }

    // Gather the embeddings that exist; profiles without one are skipped here.
    let mut sources: Vec<&SpeakerEmbedding> = Vec::new();
    for profile in profiles {
        if let Some(embedding) = profile.embedding.as_ref() {
            sources.push(embedding);
        }
    }

    // The first embedding fixes the dimension every other one must match.
    let dim = match sources.first() {
        Some(first) => first.dim(),
        None => {
            return Err(VoiceError::InvalidConfig(String::from(
                "no profiles have embeddings",
            )));
        }
    };
    if let Some(mismatched) = sources.iter().find(|emb| emb.dim() != dim) {
        return Err(VoiceError::DimensionMismatch {
            expected: dim,
            got: mismatched.dim(),
        });
    }

    // Accumulate each component pre-divided by the embedding count.
    let divisor = sources.len() as f32;
    let mut mean = vec![0.0_f32; dim];
    for emb in &sources {
        for (idx, component) in emb.as_slice().iter().enumerate() {
            mean[idx] += *component / divisor;
        }
    }

    let duration_sum = profiles
        .iter()
        .map(|p| p.reference_duration)
        .sum::<f32>();
    let quality_sum = profiles.iter().map(|p| p.quality_score).sum::<f32>();
    let quality_mean = quality_sum / profiles.len() as f32;
    let adapted = profiles.iter().any(|p| p.is_adapted());

    let mut merged = VoiceProfile::new(profiles[0].speaker_id.clone());
    merged.set_embedding(SpeakerEmbedding::from_vec(mean));
    merged.set_reference_duration(duration_sum);
    merged.set_quality_score(quality_mean);
    merged.set_adapted(adapted);
    Ok(merged)
}