1use crate::types::VoiceCharacteristics;
4use crate::Result;
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use std::time::Instant;
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct SpeakerEmbedding {
12 pub data: Vec<f32>,
14 pub confidence: f32,
16}
17
18pub struct ReferenceVoiceDatabase {
20 voices: HashMap<String, ReferenceVoice>,
22
23 embeddings: HashMap<String, SpeakerEmbedding>,
25
26 characteristics: HashMap<String, VoiceCharacteristics>,
28
29 usage_stats: HashMap<String, UsageStatistics>,
31
32 metadata: DatabaseMetadata,
34}
35
36#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct ReferenceVoice {
39 pub speaker_id: String,
41
42 pub name: String,
44
45 pub audio_samples: Vec<AudioSample>,
47
48 pub embedding: SpeakerEmbedding,
50
51 pub characteristics: VoiceCharacteristics,
53
54 pub quality_scores: QualityScores,
56
57 pub metadata: VoiceMetadata,
59
60 #[serde(skip)]
62 pub last_used: Option<Instant>,
63}
64
65#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct AudioSample {
68 pub id: String,
70
71 #[serde(skip)]
73 pub audio_data: Vec<f32>,
74
75 pub sample_rate: u32,
77
78 pub duration: f32,
80
81 pub transcription: Option<String>,
83
84 pub quality_score: f32,
86
87 pub phonetic_content: PhoneticAnalysis,
89}
90
91#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct QualityScores {
94 pub overall: f32,
96
97 pub clarity: f32,
99
100 pub naturalness: f32,
102
103 pub consistency: f32,
105
106 pub recording_quality: f32,
108
109 pub prosody_quality: f32,
111}
112
113#[derive(Debug, Clone, Serialize, Deserialize)]
115pub struct VoiceMetadata {
116 pub language: String,
118
119 pub accent: Option<String>,
121
122 pub gender: Option<String>,
124
125 pub age_group: Option<String>,
127
128 pub recording_environment: Option<String>,
130
131 pub tags: Vec<String>,
133
134 #[serde(
136 skip_serializing,
137 skip_deserializing,
138 default = "std::time::Instant::now"
139 )]
140 pub created: Instant,
141
142 #[serde(skip_serializing, skip_deserializing, default)]
144 pub modified: Option<Instant>,
145}
146
147#[derive(Debug, Clone, Serialize, Deserialize)]
149pub struct PhoneticAnalysis {
150 pub phoneme_distribution: HashMap<String, f32>,
152
153 pub diversity_score: f32,
155
156 pub vowel_consonant_ratio: f32,
158
159 pub prosodic_features: ProsodicFeatures,
161}
162
163#[derive(Debug, Clone, Serialize, Deserialize)]
165pub struct ProsodicFeatures {
166 pub mean_f0: f32,
168
169 pub f0_range: (f32, f32),
171
172 pub speaking_rate: f32,
174
175 pub pause_patterns: Vec<f32>,
177
178 pub stress_patterns: Vec<f32>,
180}
181
182#[derive(Debug, Clone, Serialize, Deserialize)]
184pub struct UsageStatistics {
185 pub usage_count: u64,
187
188 pub avg_similarity: f32,
190
191 pub success_rate: f32,
193
194 #[serde(skip)]
196 pub last_used: Option<Instant>,
197
198 pub preferred_contexts: Vec<String>,
200}
201
202#[derive(Debug, Clone)]
204pub struct DatabaseMetadata {
205 pub total_voices: usize,
207
208 pub total_duration: f32,
210
211 pub languages: Vec<String>,
213
214 pub last_updated: Instant,
216
217 pub version: String,
219
220 pub index_stats: IndexStatistics,
222}
223
224#[derive(Debug, Clone, Serialize, Deserialize)]
226pub struct IndexStatistics {
227 pub embedding_index_size: usize,
229
230 pub characteristic_index_size: usize,
232
233 pub search_performance: SearchPerformanceMetrics,
235}
236
237#[derive(Debug, Clone, Serialize, Deserialize)]
239pub struct SearchPerformanceMetrics {
240 pub avg_search_time: f32,
242
243 pub cache_hit_rate: f32,
245
246 pub index_efficiency: f32,
248}
249
250impl Default for ReferenceVoiceDatabase {
251 fn default() -> Self {
252 Self::new()
253 }
254}
255
256impl ReferenceVoiceDatabase {
257 pub fn new() -> Self {
263 Self {
264 voices: HashMap::new(),
265 embeddings: HashMap::new(),
266 characteristics: HashMap::new(),
267 usage_stats: HashMap::new(),
268 metadata: DatabaseMetadata {
269 total_voices: 0,
270 total_duration: 0.0,
271 languages: Vec::new(),
272 last_updated: Instant::now(),
273 version: "1.0.0".to_string(),
274 index_stats: IndexStatistics {
275 embedding_index_size: 0,
276 characteristic_index_size: 0,
277 search_performance: SearchPerformanceMetrics {
278 avg_search_time: 0.0,
279 cache_hit_rate: 0.0,
280 index_efficiency: 1.0,
281 },
282 },
283 },
284 }
285 }
286
287 pub fn add_voice(&mut self, voice: ReferenceVoice) -> Result<()> {
301 let speaker_id = voice.speaker_id.clone();
302 self.embeddings
303 .insert(speaker_id.clone(), voice.embedding.clone());
304 self.characteristics
305 .insert(speaker_id.clone(), voice.characteristics.clone());
306 self.usage_stats.insert(
307 speaker_id.clone(),
308 UsageStatistics {
309 usage_count: 0,
310 avg_similarity: 0.0,
311 success_rate: 0.0,
312 last_used: None,
313 preferred_contexts: Vec::new(),
314 },
315 );
316 self.voices.insert(speaker_id, voice);
317 self.metadata.total_voices += 1;
318 Ok(())
319 }
320
321 pub fn remove_voice(&mut self, speaker_id: &str) -> Result<()> {
335 self.voices.remove(speaker_id);
336 self.embeddings.remove(speaker_id);
337 self.characteristics.remove(speaker_id);
338 self.usage_stats.remove(speaker_id);
339 if self.metadata.total_voices > 0 {
340 self.metadata.total_voices -= 1;
341 }
342 Ok(())
343 }
344
345 pub fn find_similar_voices(
360 &self,
361 target_characteristics: &VoiceCharacteristics,
362 max_voices: usize,
363 ) -> Result<Vec<ReferenceVoice>> {
364 let mut similarities = Vec::new();
365
366 for voice in self.voices.values() {
367 let similarity =
368 self.calculate_similarity(&voice.characteristics, target_characteristics);
369 similarities.push((similarity, voice.clone()));
370 }
371
372 similarities.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
374
375 Ok(similarities
377 .into_iter()
378 .take(max_voices)
379 .map(|(_, voice)| voice)
380 .collect())
381 }
382
383 pub fn metadata(&self) -> &DatabaseMetadata {
392 &self.metadata
393 }
394
395 fn calculate_similarity(
396 &self,
397 voice1: &VoiceCharacteristics,
398 voice2: &VoiceCharacteristics,
399 ) -> f32 {
400 let mut similarity = 0.0;
402 let mut total_weight: f32 = 0.0;
403
404 if voice1.gender == voice2.gender {
406 similarity += 0.25;
407 } else if voice1.gender.is_some() && voice2.gender.is_some() {
408 similarity += 0.1; }
410 total_weight += 0.25;
411
412 if voice1.age_group == voice2.age_group {
414 similarity += 0.15;
415 } else if voice1.age_group.is_some() && voice2.age_group.is_some() {
416 let age_similarity = match (voice1.age_group, voice2.age_group) {
418 (Some(a1), Some(a2)) => {
419 use crate::types::AgeGroup;
420 match (a1, a2) {
421 (AgeGroup::YoungAdult, AgeGroup::MiddleAged)
422 | (AgeGroup::MiddleAged, AgeGroup::YoungAdult) => 0.8,
423 (AgeGroup::MiddleAged, AgeGroup::Senior)
424 | (AgeGroup::Senior, AgeGroup::MiddleAged) => 0.6,
425 (AgeGroup::Child, AgeGroup::YoungAdult)
426 | (AgeGroup::YoungAdult, AgeGroup::Child) => 0.4,
427 _ => 0.2,
428 }
429 }
430 _ => 0.05,
431 };
432 similarity += 0.15 * age_similarity;
433 }
434 total_weight += 0.15;
435
436 if voice1.accent == voice2.accent {
438 similarity += 0.2;
439 } else if voice1.accent.is_some() && voice2.accent.is_some() {
440 let accent_similarity =
442 if let (Some(ref a1), Some(ref a2)) = (&voice1.accent, &voice2.accent) {
443 if (a1.contains("american") && a2.contains("canadian"))
444 || (a2.contains("american") && a1.contains("canadian"))
445 {
446 0.8
447 } else if (a1.contains("british") && a2.contains("australian"))
448 || (a2.contains("british") && a1.contains("australian"))
449 {
450 0.7
451 } else {
452 0.3
453 }
454 } else {
455 0.1
456 };
457 similarity += 0.2 * accent_similarity;
458 }
459 total_weight += 0.2;
460
461 let pitch_diff = (voice1.pitch.mean_f0 - voice2.pitch.mean_f0).abs();
463 let pitch_similarity = if pitch_diff < 10.0 {
464 1.0 } else if pitch_diff < 50.0 {
466 1.0 - (pitch_diff - 10.0) / 40.0 } else {
468 (1.0 - (pitch_diff / 200.0)).max(0.0) };
470 similarity += pitch_similarity * 0.2;
471 total_weight += 0.2;
472
473 let formant_diff = (voice1.spectral.formant_shift - voice2.spectral.formant_shift).abs();
475 let spectral_similarity = 1.0 - formant_diff.min(1.0);
476 similarity += spectral_similarity * 0.1;
477 total_weight += 0.1;
478
479 let breathiness_diff = (voice1.quality.breathiness - voice2.quality.breathiness).abs();
481 let roughness_diff = (voice1.quality.roughness - voice2.quality.roughness).abs();
482 let quality_similarity = 1.0 - ((breathiness_diff + roughness_diff) / 2.0).min(1.0);
483 similarity += quality_similarity * 0.1;
484 total_weight += 0.1;
485
486 similarity / total_weight.max(1e-10)
488 }
489}
490
491impl Default for VoiceMetadata {
492 fn default() -> Self {
493 Self {
494 language: String::new(),
495 accent: None,
496 gender: None,
497 age_group: None,
498 recording_environment: None,
499 tags: Vec::new(),
500 created: Instant::now(),
501 modified: None,
502 }
503 }
504}
505
506impl Default for DatabaseMetadata {
507 fn default() -> Self {
508 Self {
509 total_voices: 0,
510 total_duration: 0.0,
511 languages: Vec::new(),
512 last_updated: Instant::now(),
513 version: "1.0.0".to_string(),
514 index_stats: IndexStatistics::default(),
515 }
516 }
517}
518
519impl Default for IndexStatistics {
520 fn default() -> Self {
521 Self {
522 embedding_index_size: 0,
523 characteristic_index_size: 0,
524 search_performance: SearchPerformanceMetrics::default(),
525 }
526 }
527}
528
529impl Default for SearchPerformanceMetrics {
530 fn default() -> Self {
531 Self {
532 avg_search_time: 0.0,
533 cache_hit_rate: 0.0,
534 index_efficiency: 1.0,
535 }
536 }
537}