1use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5use std::str::FromStr;
6use std::time::{Duration, SystemTime};
7
8#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
10pub enum ConversionType {
11 #[default]
13 SpeakerConversion,
14 AgeTransformation,
16 GenderTransformation,
18 PitchShift,
20 SpeedTransformation,
22 VoiceMorphing,
24 EmotionalTransformation,
26 ZeroShotConversion,
28 PassThrough,
30 Custom(String),
32}
33
34impl ConversionType {
35 pub fn as_str(&self) -> &str {
37 match self {
38 ConversionType::SpeakerConversion => "speaker_conversion",
39 ConversionType::AgeTransformation => "age_transformation",
40 ConversionType::GenderTransformation => "gender_transformation",
41 ConversionType::PitchShift => "pitch_shift",
42 ConversionType::SpeedTransformation => "speed_transformation",
43 ConversionType::VoiceMorphing => "voice_morphing",
44 ConversionType::EmotionalTransformation => "emotional_transformation",
45 ConversionType::ZeroShotConversion => "zero_shot_conversion",
46 ConversionType::PassThrough => "pass_through",
47 ConversionType::Custom(name) => name,
48 }
49 }
50
51 pub fn parse_type(s: &str) -> Option<Self> {
53 match s.to_lowercase().as_str() {
54 "speaker_conversion" => Some(ConversionType::SpeakerConversion),
55 "age_transformation" => Some(ConversionType::AgeTransformation),
56 "gender_transformation" => Some(ConversionType::GenderTransformation),
57 "pitch_shift" => Some(ConversionType::PitchShift),
58 "speed_transformation" => Some(ConversionType::SpeedTransformation),
59 "voice_morphing" => Some(ConversionType::VoiceMorphing),
60 "emotional_transformation" => Some(ConversionType::EmotionalTransformation),
61 "zero_shot_conversion" => Some(ConversionType::ZeroShotConversion),
62 "pass_through" => Some(ConversionType::PassThrough),
63 _ => Some(ConversionType::Custom(s.to_string())),
64 }
65 }
66
67 pub fn supports_realtime(&self) -> bool {
69 match self {
70 ConversionType::PitchShift => true,
71 ConversionType::SpeedTransformation => true,
72 ConversionType::SpeakerConversion => true,
73 ConversionType::VoiceMorphing => false, ConversionType::AgeTransformation => true,
75 ConversionType::GenderTransformation => true,
76 ConversionType::EmotionalTransformation => true,
77 ConversionType::ZeroShotConversion => false, ConversionType::PassThrough => true, ConversionType::Custom(_) => false, }
81 }
82}
83
84impl FromStr for ConversionType {
85 type Err = String;
86
87 fn from_str(s: &str) -> Result<Self, Self::Err> {
88 Self::parse_type(s).ok_or_else(|| format!("Unknown conversion type: {s}"))
89 }
90}
91
92#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
94pub struct VoiceCharacteristics {
95 pub pitch: PitchCharacteristics,
97 pub timing: TimingCharacteristics,
99 pub spectral: SpectralCharacteristics,
101 pub quality: QualityCharacteristics,
103 pub age_group: Option<AgeGroup>,
105 pub gender: Option<Gender>,
107 pub accent: Option<String>,
109 pub custom_params: HashMap<String, f32>,
111}
112
113impl VoiceCharacteristics {
114 pub fn new() -> Self {
116 Self::default()
117 }
118
119 pub fn for_age(age_group: AgeGroup) -> Self {
121 let mut chars = Self::new();
122 chars.age_group = Some(age_group);
123
124 match age_group {
126 AgeGroup::Child => {
127 chars.pitch.mean_f0 = 250.0; chars.timing.speaking_rate = 1.1; chars.quality.breathiness = 0.2;
130 }
131 AgeGroup::Teen => {
132 chars.pitch.mean_f0 = 200.0;
133 chars.timing.speaking_rate = 1.2; chars.quality.roughness = 0.1;
135 }
136 AgeGroup::YoungAdult => {
137 chars.pitch.mean_f0 = 150.0;
138 chars.timing.speaking_rate = 1.0; }
140 AgeGroup::Adult => {
141 chars.pitch.mean_f0 = 145.0;
142 chars.timing.speaking_rate = 0.98; chars.quality.stability = 0.85;
144 }
145 AgeGroup::MiddleAged => {
146 chars.pitch.mean_f0 = 140.0;
147 chars.timing.speaking_rate = 0.95; chars.quality.stability = 0.9;
149 }
150 AgeGroup::Senior => {
151 chars.pitch.mean_f0 = 130.0;
152 chars.timing.speaking_rate = 0.85; chars.quality.breathiness = 0.3;
154 chars.quality.roughness = 0.2;
155 }
156 AgeGroup::Unknown => {}
157 }
158
159 chars
160 }
161
162 pub fn for_gender(gender: Gender) -> Self {
164 let mut chars = Self::new();
165 chars.gender = Some(gender);
166
167 match gender {
169 Gender::Male => {
170 chars.pitch.mean_f0 = 120.0; chars.spectral.formant_shift = -0.1; chars.quality.roughness = 0.15;
173 }
174 Gender::Female => {
175 chars.pitch.mean_f0 = 200.0; chars.spectral.formant_shift = 0.1; chars.quality.breathiness = 0.1;
178 }
179 Gender::NonBinary | Gender::Other | Gender::Unknown => {
180 chars.pitch.mean_f0 = 160.0; }
182 }
183
184 chars
185 }
186
187 pub fn interpolate(&self, other: &Self, factor: f32) -> Self {
189 let t = factor.clamp(0.0, 1.0);
190 let inv_t = 1.0 - t;
191
192 let mut result = self.clone();
193
194 result.pitch.mean_f0 = self.pitch.mean_f0 * inv_t + other.pitch.mean_f0 * t;
196 result.pitch.range = self.pitch.range * inv_t + other.pitch.range * t;
197 result.pitch.jitter = self.pitch.jitter * inv_t + other.pitch.jitter * t;
198
199 result.timing.speaking_rate =
201 self.timing.speaking_rate * inv_t + other.timing.speaking_rate * t;
202 result.timing.pause_duration =
203 self.timing.pause_duration * inv_t + other.timing.pause_duration * t;
204
205 result.spectral.formant_shift =
207 self.spectral.formant_shift * inv_t + other.spectral.formant_shift * t;
208 result.spectral.brightness =
209 self.spectral.brightness * inv_t + other.spectral.brightness * t;
210
211 result.quality.breathiness =
213 self.quality.breathiness * inv_t + other.quality.breathiness * t;
214 result.quality.roughness = self.quality.roughness * inv_t + other.quality.roughness * t;
215 result.quality.stability = self.quality.stability * inv_t + other.quality.stability * t;
216
217 for (key, &value) in &self.custom_params {
219 if let Some(&other_value) = other.custom_params.get(key) {
220 result
221 .custom_params
222 .insert(key.clone(), value * inv_t + other_value * t);
223 }
224 }
225
226 result
227 }
228}
229
230#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
232pub struct PitchCharacteristics {
233 pub mean_f0: f32,
235 pub range: f32,
237 pub jitter: f32,
239 pub stability: f32,
241}
242
243impl Default for PitchCharacteristics {
244 fn default() -> Self {
245 Self {
246 mean_f0: 150.0,
247 range: 12.0, jitter: 0.1,
249 stability: 0.8,
250 }
251 }
252}
253
254#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
256pub struct TimingCharacteristics {
257 pub speaking_rate: f32,
259 pub pause_duration: f32,
261 pub rhythm_regularity: f32,
263}
264
265impl Default for TimingCharacteristics {
266 fn default() -> Self {
267 Self {
268 speaking_rate: 1.0,
269 pause_duration: 1.0,
270 rhythm_regularity: 0.7,
271 }
272 }
273}
274
275#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
277pub struct SpectralCharacteristics {
278 pub formant_shift: f32,
280 pub brightness: f32,
282 pub spectral_tilt: f32,
284 pub harmonicity: f32,
286}
287
288impl Default for SpectralCharacteristics {
289 fn default() -> Self {
290 Self {
291 formant_shift: 0.0,
292 brightness: 0.0,
293 spectral_tilt: 0.0,
294 harmonicity: 0.8,
295 }
296 }
297}
298
299#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
301pub struct QualityCharacteristics {
302 pub breathiness: f32,
304 pub roughness: f32,
306 pub stability: f32,
308 pub resonance: f32,
310}
311
312impl Default for QualityCharacteristics {
313 fn default() -> Self {
314 Self {
315 breathiness: 0.1,
316 roughness: 0.1,
317 stability: 0.8,
318 resonance: 0.7,
319 }
320 }
321}
322
323#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
325pub enum AgeGroup {
326 Child,
328 Teen,
330 YoungAdult,
332 Adult,
334 MiddleAged,
336 Senior,
338 #[default]
340 Unknown,
341}
342
343#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
345pub enum Gender {
346 Male,
348 Female,
350 NonBinary,
352 Other,
354 #[default]
356 Unknown,
357}
358
359#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
361pub struct ConversionTarget {
362 pub characteristics: VoiceCharacteristics,
364 pub speaker_id: Option<String>,
366 pub reference_samples: Vec<AudioSample>,
368 pub strength: f32,
370 pub preserve_original: f32,
372}
373
374impl ConversionTarget {
375 pub fn new(characteristics: VoiceCharacteristics) -> Self {
377 Self {
378 characteristics,
379 speaker_id: None,
380 reference_samples: Vec::new(),
381 strength: 1.0,
382 preserve_original: 0.0,
383 }
384 }
385
386 pub fn with_speaker_id(mut self, speaker_id: String) -> Self {
388 self.speaker_id = Some(speaker_id);
389 self
390 }
391
392 pub fn with_reference_sample(mut self, sample: AudioSample) -> Self {
394 self.reference_samples.push(sample);
395 self
396 }
397
398 pub fn with_strength(mut self, strength: f32) -> Self {
400 self.strength = strength.clamp(0.0, 1.0);
401 self
402 }
403
404 pub fn with_preservation(mut self, preserve: f32) -> Self {
406 self.preserve_original = preserve.clamp(0.0, 1.0);
407 self
408 }
409}
410
411#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
413pub struct AudioSample {
414 pub id: String,
416 pub audio: Vec<f32>,
418 pub sample_rate: u32,
420 pub duration: f32,
422 pub metadata: HashMap<String, String>,
424}
425
426impl AudioSample {
427 pub fn new(id: String, audio: Vec<f32>, sample_rate: u32) -> Self {
429 let duration = audio.len() as f32 / sample_rate as f32;
430 Self {
431 id,
432 audio,
433 sample_rate,
434 duration,
435 metadata: HashMap::new(),
436 }
437 }
438
439 pub fn with_metadata(mut self, key: String, value: String) -> Self {
441 self.metadata.insert(key, value);
442 self
443 }
444}
445
446#[derive(Debug, Clone, Serialize, Deserialize)]
448pub struct ConversionRequest {
449 pub id: String,
451 pub source_audio: Vec<f32>,
453 pub source_sample_rate: u32,
455 pub conversion_type: ConversionType,
457 pub target: ConversionTarget,
459 pub realtime: bool,
461 pub quality_level: f32,
463 pub parameters: HashMap<String, f32>,
465 pub timestamp: SystemTime,
467}
468
469impl ConversionRequest {
470 pub fn new(
472 id: String,
473 source_audio: Vec<f32>,
474 source_sample_rate: u32,
475 conversion_type: ConversionType,
476 target: ConversionTarget,
477 ) -> Self {
478 Self {
479 id,
480 source_audio,
481 source_sample_rate,
482 conversion_type,
483 target,
484 realtime: false,
485 quality_level: 0.8,
486 parameters: HashMap::new(),
487 timestamp: SystemTime::now(),
488 }
489 }
490
491 pub fn with_realtime(mut self, realtime: bool) -> Self {
493 self.realtime = realtime;
494 self
495 }
496
497 pub fn with_quality_level(mut self, level: f32) -> Self {
499 self.quality_level = level.clamp(0.0, 1.0);
500 self
501 }
502
503 pub fn with_parameter(mut self, key: String, value: f32) -> Self {
505 self.parameters.insert(key, value);
506 self
507 }
508
509 pub fn validate(&self) -> crate::Result<()> {
511 if self.source_audio.is_empty() {
512 return Err(crate::Error::Validation {
513 message: "Source audio cannot be empty".to_string(),
514 field: Some("source_audio".to_string()),
515 expected: Some("Non-empty audio data".to_string()),
516 actual: Some("Empty audio data".to_string()),
517 context: None,
518 recovery_suggestions: Box::new(vec![
519 "Provide valid audio data".to_string(),
520 "Check audio file loading".to_string(),
521 ]),
522 });
523 }
524
525 if self.source_sample_rate == 0 {
526 return Err(crate::Error::Validation {
527 message: "Source sample rate must be positive".to_string(),
528 field: Some("source_sample_rate".to_string()),
529 expected: Some("Positive sample rate".to_string()),
530 actual: Some(format!("{}", self.source_sample_rate)),
531 context: None,
532 recovery_suggestions: Box::new(vec![
533 "Set sample rate to a positive value (e.g., 44100, 48000)".to_string(),
534 "Check audio metadata".to_string(),
535 ]),
536 });
537 }
538
539 if self.realtime && !self.conversion_type.supports_realtime() {
540 return Err(crate::Error::Validation {
541 message: format!(
542 "Conversion type {:?} does not support real-time processing",
543 self.conversion_type
544 ),
545 field: Some("realtime".to_string()),
546 expected: Some("False for non-realtime conversion types".to_string()),
547 actual: Some("True".to_string()),
548 context: None,
549 recovery_suggestions: Box::new(vec![
550 "Set realtime to false".to_string(),
551 "Use a different conversion type that supports real-time processing"
552 .to_string(),
553 ]),
554 });
555 }
556
557 Ok(())
558 }
559
560 pub fn source_duration(&self) -> f32 {
562 self.source_audio.len() as f32 / self.source_sample_rate as f32
563 }
564}
565
566#[derive(Debug, Clone, Serialize, Deserialize)]
568pub struct ConversionResult {
569 pub request_id: String,
571 pub converted_audio: Vec<f32>,
573 pub output_sample_rate: u32,
575 pub quality_metrics: HashMap<String, f32>,
577 pub artifacts: Option<DetectedArtifacts>,
579 pub objective_quality: Option<ObjectiveQualityMetrics>,
581 pub processing_time: Duration,
583 pub conversion_type: ConversionType,
585 pub success: bool,
587 pub error_message: Option<String>,
589 pub timestamp: SystemTime,
591}
592
593#[derive(Debug, Clone, Serialize, Deserialize)]
595pub struct DetectedArtifacts {
596 pub overall_score: f32,
598 pub artifact_types: HashMap<String, f32>,
600 pub artifact_count: usize,
602 pub quality_assessment: QualityAssessment,
604}
605
606#[derive(Debug, Clone, Serialize, Deserialize)]
608pub struct QualityAssessment {
609 pub overall_quality: f32,
611 pub naturalness: f32,
613 pub clarity: f32,
615 pub consistency: f32,
617 pub recommended_adjustments: Vec<QualityAdjustment>,
619}
620
621#[derive(Debug, Clone, Serialize, Deserialize)]
623pub struct QualityAdjustment {
624 pub adjustment_type: String,
626 pub strength: f32,
628 pub expected_improvement: f32,
630}
631
632#[derive(Debug, Clone, Serialize, Deserialize)]
634pub struct ObjectiveQualityMetrics {
635 pub overall_score: f32,
637 pub spectral_similarity: f32,
639 pub temporal_consistency: f32,
641 pub prosodic_preservation: f32,
643 pub naturalness: f32,
645 pub perceptual_quality: f32,
647 pub snr_estimate: f32,
649 pub segmental_snr: f32,
651}
652
653impl ConversionResult {
654 pub fn success(
656 request_id: String,
657 converted_audio: Vec<f32>,
658 output_sample_rate: u32,
659 processing_time: Duration,
660 conversion_type: ConversionType,
661 ) -> Self {
662 Self {
663 request_id,
664 converted_audio,
665 output_sample_rate,
666 quality_metrics: HashMap::new(),
667 artifacts: None,
668 objective_quality: None,
669 processing_time,
670 conversion_type,
671 success: true,
672 error_message: None,
673 timestamp: SystemTime::now(),
674 }
675 }
676
677 pub fn failure(
679 request_id: String,
680 error_message: String,
681 processing_time: Duration,
682 conversion_type: ConversionType,
683 ) -> Self {
684 Self {
685 request_id,
686 converted_audio: Vec::new(),
687 output_sample_rate: 0,
688 quality_metrics: HashMap::new(),
689 artifacts: None,
690 objective_quality: None,
691 processing_time,
692 conversion_type,
693 success: false,
694 error_message: Some(error_message),
695 timestamp: SystemTime::now(),
696 }
697 }
698
699 pub fn with_quality_metric(mut self, name: String, value: f32) -> Self {
701 self.quality_metrics.insert(name, value);
702 self
703 }
704
705 pub fn with_artifacts(mut self, artifacts: DetectedArtifacts) -> Self {
707 self.artifacts = Some(artifacts);
708 self
709 }
710
711 pub fn with_objective_quality(mut self, quality: ObjectiveQualityMetrics) -> Self {
713 self.objective_quality = Some(quality);
714 self
715 }
716
717 pub fn output_duration(&self) -> f32 {
719 if self.output_sample_rate == 0 {
720 return 0.0;
721 }
722 self.converted_audio.len() as f32 / self.output_sample_rate as f32
723 }
724}
725
#[cfg(test)]
mod tests {
    use super::*;

    /// Real-time flags and the string form of built-in conversion types.
    #[test]
    fn test_conversion_type_properties() {
        let pitch_shift = ConversionType::PitchShift;

        assert!(pitch_shift.supports_realtime());
        assert!(ConversionType::SpeakerConversion.supports_realtime());
        assert!(!ConversionType::VoiceMorphing.supports_realtime());

        assert_eq!(pitch_shift.as_str(), "pitch_shift");

        let parsed = "pitch_shift".parse::<ConversionType>();
        assert_eq!(parsed.ok(), Some(ConversionType::PitchShift));
    }

    /// The child preset is higher-pitched and faster than the senior preset.
    #[test]
    fn test_voice_characteristics_age() {
        let child = VoiceCharacteristics::for_age(AgeGroup::Child);
        let senior = VoiceCharacteristics::for_age(AgeGroup::Senior);

        assert!(child.pitch.mean_f0 > senior.pitch.mean_f0);
        assert!(child.timing.speaking_rate > senior.timing.speaking_rate);
    }

    /// The male preset has lower pitch and formant shift than the female preset.
    #[test]
    fn test_voice_characteristics_gender() {
        let male = VoiceCharacteristics::for_gender(Gender::Male);
        let female = VoiceCharacteristics::for_gender(Gender::Female);

        assert!(male.pitch.mean_f0 < female.pitch.mean_f0);
        assert!(male.spectral.formant_shift < female.spectral.formant_shift);
    }

    /// A factor of 0.5 lands on the midpoint of the two mean F0 values.
    #[test]
    fn test_voice_characteristics_interpolation() {
        let low = VoiceCharacteristics::for_gender(Gender::Male);
        let high = VoiceCharacteristics::for_gender(Gender::Female);

        let blended = low.interpolate(&high, 0.5);
        let midpoint = (low.pitch.mean_f0 + high.pitch.mean_f0) / 2.0;

        assert!((blended.pitch.mean_f0 - midpoint).abs() < 0.001);
    }

    /// Builder methods store the values they are given.
    #[test]
    fn test_conversion_target() {
        let target = ConversionTarget::new(VoiceCharacteristics::for_age(AgeGroup::YoungAdult))
            .with_speaker_id("speaker123".to_string())
            .with_strength(0.8)
            .with_preservation(0.2);

        assert_eq!(target.speaker_id, Some("speaker123".to_string()));
        assert_eq!(target.strength, 0.8);
        assert_eq!(target.preserve_original, 0.2);
    }

    /// Construction derives the duration and keeps audio, rate and metadata.
    #[test]
    fn test_audio_sample() {
        let samples = vec![0.1, -0.2, 0.3, -0.4];
        let sample = AudioSample::new("test".to_string(), samples.clone(), 16000)
            .with_metadata("quality".to_string(), "high".to_string());

        assert_eq!(sample.audio, samples);
        assert_eq!(sample.sample_rate, 16000);
        assert_eq!(sample.duration, 4.0 / 16000.0);
        assert_eq!(sample.metadata.get("quality"), Some(&"high".to_string()));
    }

    /// Validation accepts a well-formed request and rejects empty audio and
    /// real-time use of a non-real-time conversion type.
    #[test]
    fn test_conversion_request_validation() {
        let target = ConversionTarget::new(VoiceCharacteristics::default());
        let build = |id: &str, audio: Vec<f32>, kind: ConversionType| {
            ConversionRequest::new(id.to_string(), audio, 16000, kind, target.clone())
        };

        let valid = build("req1", vec![0.1, 0.2, 0.3], ConversionType::PitchShift);
        assert!(valid.validate().is_ok());

        let empty_audio = build("req2", vec![], ConversionType::PitchShift);
        assert!(empty_audio.validate().is_err());

        let realtime_morphing =
            build("req3", vec![0.1, 0.2], ConversionType::VoiceMorphing).with_realtime(true);
        assert!(realtime_morphing.validate().is_err());
    }

    /// A successful result reports success and the derived output duration.
    #[test]
    fn test_conversion_result() {
        let elapsed = Duration::from_millis(100);
        let result = ConversionResult::success(
            "req1".to_string(),
            vec![0.1, 0.2, 0.3, 0.4],
            22050,
            elapsed,
            ConversionType::PitchShift,
        );

        assert!(result.success);
        assert_eq!(result.output_duration(), 4.0 / 22050.0);
    }
}