/// Prosody measurements for a stretch of speech.
///
/// Every field is a plain `f32` and nothing in this type enforces a range.
/// The ranges quoted below are the ones `normalize_prosody` clamps to.
///
/// `PartialEq` compares field by field with ordinary float semantics, so a
/// value containing NaN is never equal to anything, itself included.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct ProsodyFeatures {
    /// Pitch in hertz (per the field name). Clamped to 50–600 by
    /// `normalize_prosody`.
    pub pitch_hz: f32,
    /// Pitch range; presumably in hertz like `pitch_hz` — TODO confirm.
    /// Clamped to 0–300.
    pub pitch_range: f32,
    /// Speaking rate. The unit is not stated in this file (presumably
    /// syllables per second — TODO confirm). Clamped to 0.1–10.
    pub speech_rate: f32,
    /// Loudness, clamped to 0–1.
    pub loudness: f32,
    /// Energy, clamped to 0–1.
    pub energy: f32,
    /// Jitter, clamped to 0–1.
    pub jitter: f32,
    /// Shimmer, clamped to 0–1.
    pub shimmer: f32,
    /// Pause ratio, clamped to 0–1; presumably the fraction of time spent
    /// silent — TODO confirm.
    pub pause_ratio: f32,
}
19
/// The emotion categories this module can classify and synthesise.
///
/// The declaration order matches the order in which
/// `classify_prosody_emotion` lists its candidates.
///
/// `Eq` and `Hash` are derived so the enum can be used as a key in
/// `HashMap`/`HashSet`.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ProsodyEmotion {
    /// Resting baseline; `generate_prosody_for_emotion` ignores intensity for it.
    Neutral,
    Happy,
    Sad,
    Angry,
    Fearful,
    Disgusted,
    Surprised,
    Calm,
}
32
33#[allow(dead_code)]
34#[derive(Debug, Clone)]
35pub struct ProsodyProfile {
36 pub emotion: ProsodyEmotion,
37 pub intensity: f32,
38 pub features: ProsodyFeatures,
39}
40
41#[allow(dead_code)]
43pub fn classify_prosody_emotion(features: &ProsodyFeatures) -> ProsodyProfile {
44 let mut scores = [
46 (ProsodyEmotion::Neutral, 0.0_f32),
47 (ProsodyEmotion::Happy, 0.0_f32),
48 (ProsodyEmotion::Sad, 0.0_f32),
49 (ProsodyEmotion::Angry, 0.0_f32),
50 (ProsodyEmotion::Fearful, 0.0_f32),
51 (ProsodyEmotion::Disgusted, 0.0_f32),
52 (ProsodyEmotion::Surprised, 0.0_f32),
53 (ProsodyEmotion::Calm, 0.0_f32),
54 ];
55
56 scores[1].1 += if features.pitch_hz > 200.0 { 1.0 } else { 0.0 };
58 scores[1].1 += if features.speech_rate > 5.0 {
59 1.5
60 } else if features.speech_rate > 4.0 {
61 0.5
62 } else {
63 0.0
64 };
65 scores[1].1 += if features.loudness > 0.6 { 1.0 } else { 0.0 };
66 scores[1].1 += if features.pause_ratio < 0.15 {
67 0.5
68 } else {
69 0.0
70 };
71 scores[1].1 += if features.jitter < 0.03 { 0.5 } else { 0.0 }; scores[2].1 += if features.pitch_hz < 150.0 { 1.0 } else { 0.0 };
75 scores[2].1 += if features.speech_rate < 2.5 { 1.0 } else { 0.0 };
76 scores[2].1 += if features.loudness < 0.4 { 1.0 } else { 0.0 };
77 scores[2].1 += if features.pause_ratio > 0.4 { 1.0 } else { 0.0 };
78 scores[2].1 += if features.jitter > 0.05 { 0.5 } else { 0.0 };
79
80 scores[3].1 += if features.pitch_range > 80.0 {
82 1.0
83 } else {
84 0.0
85 };
86 scores[3].1 += if features.energy > 0.7 { 1.0 } else { 0.0 };
87 scores[3].1 += if features.loudness > 0.7 { 1.0 } else { 0.0 };
88 scores[3].1 += if features.pause_ratio < 0.1 { 1.0 } else { 0.0 }; scores[3].1 += if features.speech_rate < 5.2 && features.speech_rate > 3.5 {
90 0.5
91 } else {
92 0.0
93 };
94
95 scores[4].1 += if features.jitter > 0.06 { 1.0 } else { 0.0 };
97 scores[4].1 += if features.shimmer > 0.06 { 1.0 } else { 0.0 };
98 scores[4].1 += if features.speech_rate > 4.5 { 0.5 } else { 0.0 };
99 scores[4].1 += if features.pause_ratio > 0.3 { 0.5 } else { 0.0 };
100
101 scores[5].1 += if features.pitch_hz < 160.0 { 0.5 } else { 0.0 };
103 scores[5].1 += if features.speech_rate < 3.0 { 0.5 } else { 0.0 };
104 scores[5].1 += if features.jitter > 0.04 { 0.5 } else { 0.0 };
105 scores[5].1 += if features.shimmer > 0.04 { 0.5 } else { 0.0 };
106
107 scores[6].1 += if features.pitch_hz > 220.0 { 1.0 } else { 0.0 };
109 scores[6].1 += if features.pitch_range > 100.0 {
110 1.0
111 } else {
112 0.0
113 };
114 scores[6].1 += if features.speech_rate > 5.0 { 0.5 } else { 0.0 };
115
116 scores[7].1 += if features.jitter < 0.02 { 1.0 } else { 0.0 };
118 scores[7].1 += if features.shimmer < 0.02 { 1.0 } else { 0.0 };
119 scores[7].1 += if features.energy < 0.45 {
120 1.0
121 } else if features.energy < 0.55 {
122 0.3
123 } else {
124 0.0
125 };
126 scores[7].1 += if features.pause_ratio > 0.2 && features.pause_ratio < 0.4 {
127 0.5
128 } else {
129 0.0
130 };
131
132 let pitch_neutral = if (features.pitch_hz - 160.0).abs() < 20.0 {
134 1.2
135 } else {
136 0.0
137 };
138 let rate_neutral = if (features.speech_rate - 3.5).abs() < 0.5 {
139 1.2
140 } else {
141 0.0
142 };
143 let loudness_neutral = if (features.loudness - 0.5).abs() < 0.1 {
144 0.8
145 } else {
146 0.0
147 };
148 let energy_neutral = if (features.energy - 0.5).abs() < 0.1 {
149 0.5
150 } else {
151 0.0
152 };
153 scores[0].1 = pitch_neutral + rate_neutral + loudness_neutral + energy_neutral;
154
155 let best = scores
156 .iter()
157 .enumerate()
158 .max_by(|a, b| {
159 a.1 .1
160 .partial_cmp(&b.1 .1)
161 .unwrap_or(std::cmp::Ordering::Equal)
162 })
163 .map(|(i, _)| i)
164 .unwrap_or(0);
165
166 let total: f32 = scores.iter().map(|s| s.1).sum();
167 let intensity = if total > 0.0 {
168 (scores[best].1 / total).clamp(0.0, 1.0)
169 } else {
170 0.5
171 };
172
173 ProsodyProfile {
174 emotion: scores[best].0.clone(),
175 intensity,
176 features: features.clone(),
177 }
178}
179
180#[allow(dead_code)]
182pub fn generate_prosody_for_emotion(emotion: &ProsodyEmotion, intensity: f32) -> ProsodyFeatures {
183 let t = intensity.clamp(0.0, 1.0);
184 let lerp = |a: f32, b: f32| a + (b - a) * t;
185
186 match emotion {
187 ProsodyEmotion::Neutral => ProsodyFeatures {
188 pitch_hz: 160.0,
189 pitch_range: 40.0,
190 speech_rate: 3.5,
191 loudness: 0.5,
192 energy: 0.5,
193 jitter: 0.01,
194 shimmer: 0.01,
195 pause_ratio: 0.25,
196 },
197 ProsodyEmotion::Happy => ProsodyFeatures {
198 pitch_hz: lerp(160.0, 230.0),
199 pitch_range: lerp(40.0, 110.0),
200 speech_rate: lerp(3.5, 5.5),
201 loudness: lerp(0.5, 0.85),
202 energy: lerp(0.5, 0.8),
203 jitter: lerp(0.01, 0.02),
204 shimmer: lerp(0.01, 0.02),
205 pause_ratio: lerp(0.25, 0.1),
206 },
207 ProsodyEmotion::Sad => ProsodyFeatures {
208 pitch_hz: lerp(160.0, 120.0),
209 pitch_range: lerp(40.0, 20.0),
210 speech_rate: lerp(3.5, 1.8),
211 loudness: lerp(0.5, 0.25),
212 energy: lerp(0.5, 0.2),
213 jitter: lerp(0.01, 0.08),
214 shimmer: lerp(0.01, 0.07),
215 pause_ratio: lerp(0.25, 0.55),
216 },
217 ProsodyEmotion::Angry => ProsodyFeatures {
218 pitch_hz: lerp(160.0, 200.0),
219 pitch_range: lerp(40.0, 120.0),
220 speech_rate: lerp(3.5, 5.0),
221 loudness: lerp(0.5, 0.95),
222 energy: lerp(0.5, 0.9),
223 jitter: lerp(0.01, 0.04),
224 shimmer: lerp(0.01, 0.05),
225 pause_ratio: lerp(0.25, 0.08),
226 },
227 ProsodyEmotion::Fearful => ProsodyFeatures {
228 pitch_hz: lerp(160.0, 210.0),
229 pitch_range: lerp(40.0, 90.0),
230 speech_rate: lerp(3.5, 5.5),
231 loudness: lerp(0.5, 0.6),
232 energy: lerp(0.5, 0.55),
233 jitter: lerp(0.01, 0.09),
234 shimmer: lerp(0.01, 0.08),
235 pause_ratio: lerp(0.25, 0.4),
236 },
237 ProsodyEmotion::Disgusted => ProsodyFeatures {
238 pitch_hz: lerp(160.0, 140.0),
239 pitch_range: lerp(40.0, 30.0),
240 speech_rate: lerp(3.5, 2.5),
241 loudness: lerp(0.5, 0.45),
242 energy: lerp(0.5, 0.4),
243 jitter: lerp(0.01, 0.06),
244 shimmer: lerp(0.01, 0.06),
245 pause_ratio: lerp(0.25, 0.35),
246 },
247 ProsodyEmotion::Surprised => ProsodyFeatures {
248 pitch_hz: lerp(160.0, 250.0),
249 pitch_range: lerp(40.0, 130.0),
250 speech_rate: lerp(3.5, 5.8),
251 loudness: lerp(0.5, 0.8),
252 energy: lerp(0.5, 0.75),
253 jitter: lerp(0.01, 0.03),
254 shimmer: lerp(0.01, 0.03),
255 pause_ratio: lerp(0.25, 0.12),
256 },
257 ProsodyEmotion::Calm => ProsodyFeatures {
258 pitch_hz: lerp(160.0, 155.0),
259 pitch_range: lerp(40.0, 20.0),
260 speech_rate: lerp(3.5, 2.8),
261 loudness: lerp(0.5, 0.35),
262 energy: lerp(0.5, 0.3),
263 jitter: lerp(0.01, 0.005),
264 shimmer: lerp(0.01, 0.005),
265 pause_ratio: lerp(0.25, 0.35),
266 },
267 }
268}
269
270#[allow(dead_code)]
272pub fn prosody_to_face_params(
273 features: &ProsodyFeatures,
274) -> std::collections::HashMap<String, f32> {
275 let mut map = std::collections::HashMap::new();
276
277 let jaw_open = (features.loudness * 0.6 + features.energy * 0.4).clamp(0.0, 1.0);
279 map.insert("jaw_open".to_string(), jaw_open);
280
281 let lip_corner_up = ((features.pitch_hz - 100.0) / 200.0).clamp(0.0, 1.0);
283 map.insert("lip_corner_up".to_string(), lip_corner_up);
284
285 let brow_raise = (features.pitch_range / 150.0).clamp(0.0, 1.0);
287 map.insert("brow_raise".to_string(), brow_raise);
288
289 let brow_furrow = (features.jitter * 5.0 + (1.0 - features.pause_ratio) * 0.2).clamp(0.0, 1.0);
291 map.insert("brow_furrow".to_string(), brow_furrow);
292
293 let lip_press = (features.energy * 0.5).clamp(0.0, 1.0);
295 map.insert("lip_press".to_string(), lip_press);
296
297 let lip_stretch = ((features.speech_rate - 2.0) / 5.0).clamp(0.0, 1.0);
299 map.insert("lip_stretch".to_string(), lip_stretch);
300
301 let cheek_raise = (features.loudness * 0.7).clamp(0.0, 1.0);
303 map.insert("cheek_raise".to_string(), cheek_raise);
304
305 map
306}
307
308#[allow(dead_code)]
310pub fn interpolate_prosody(a: &ProsodyFeatures, b: &ProsodyFeatures, t: f32) -> ProsodyFeatures {
311 let t = t.clamp(0.0, 1.0);
312 let lerp = |x: f32, y: f32| x + (y - x) * t;
313 ProsodyFeatures {
314 pitch_hz: lerp(a.pitch_hz, b.pitch_hz),
315 pitch_range: lerp(a.pitch_range, b.pitch_range),
316 speech_rate: lerp(a.speech_rate, b.speech_rate),
317 loudness: lerp(a.loudness, b.loudness),
318 energy: lerp(a.energy, b.energy),
319 jitter: lerp(a.jitter, b.jitter),
320 shimmer: lerp(a.shimmer, b.shimmer),
321 pause_ratio: lerp(a.pause_ratio, b.pause_ratio),
322 }
323}
324
325#[allow(dead_code)]
327pub fn blend_prosody_emotions(emotions: &[(ProsodyEmotion, f32)]) -> ProsodyFeatures {
328 if emotions.is_empty() {
329 return generate_prosody_for_emotion(&ProsodyEmotion::Neutral, 0.5);
330 }
331
332 let total_weight: f32 = emotions.iter().map(|(_, w)| w.max(0.0)).sum();
333 if total_weight <= 0.0 {
334 return generate_prosody_for_emotion(&ProsodyEmotion::Neutral, 0.5);
335 }
336
337 let mut result = ProsodyFeatures {
338 pitch_hz: 0.0,
339 pitch_range: 0.0,
340 speech_rate: 0.0,
341 loudness: 0.0,
342 energy: 0.0,
343 jitter: 0.0,
344 shimmer: 0.0,
345 pause_ratio: 0.0,
346 };
347
348 for (emotion, weight) in emotions {
349 let w = weight.max(0.0) / total_weight;
350 let f = generate_prosody_for_emotion(emotion, 0.7);
351 result.pitch_hz += f.pitch_hz * w;
352 result.pitch_range += f.pitch_range * w;
353 result.speech_rate += f.speech_rate * w;
354 result.loudness += f.loudness * w;
355 result.energy += f.energy * w;
356 result.jitter += f.jitter * w;
357 result.shimmer += f.shimmer * w;
358 result.pause_ratio += f.pause_ratio * w;
359 }
360
361 result
362}
363
364#[allow(dead_code)]
366pub fn prosody_similarity(a: &ProsodyFeatures, b: &ProsodyFeatures) -> f32 {
367 let normalize = |f: &ProsodyFeatures| {
369 [
370 f.pitch_hz / 300.0,
371 f.pitch_range / 200.0,
372 f.speech_rate / 8.0,
373 f.loudness,
374 f.energy,
375 f.jitter * 10.0,
376 f.shimmer * 10.0,
377 f.pause_ratio,
378 ]
379 };
380
381 let na = normalize(a);
382 let nb = normalize(b);
383
384 let dot: f32 = na.iter().zip(nb.iter()).map(|(x, y)| x * y).sum();
385 let mag_a: f32 = na.iter().map(|x| x * x).sum::<f32>().sqrt();
386 let mag_b: f32 = nb.iter().map(|x| x * x).sum::<f32>().sqrt();
387
388 if mag_a < 1e-6 || mag_b < 1e-6 {
389 return 0.0;
390 }
391
392 (dot / (mag_a * mag_b)).clamp(0.0, 1.0)
393}
394
395#[allow(dead_code)]
397pub fn normalize_prosody(features: &mut ProsodyFeatures) {
398 features.pitch_hz = features.pitch_hz.clamp(50.0, 600.0);
399 features.pitch_range = features.pitch_range.clamp(0.0, 300.0);
400 features.speech_rate = features.speech_rate.clamp(0.1, 10.0);
401 features.loudness = features.loudness.clamp(0.0, 1.0);
402 features.energy = features.energy.clamp(0.0, 1.0);
403 features.jitter = features.jitter.clamp(0.0, 1.0);
404 features.shimmer = features.shimmer.clamp(0.0, 1.0);
405 features.pause_ratio = features.pause_ratio.clamp(0.0, 1.0);
406}
407
408#[allow(dead_code)]
410pub fn prosody_to_json(features: &ProsodyFeatures) -> String {
411 format!(
412 r#"{{"pitch_hz":{:.4},"pitch_range":{:.4},"speech_rate":{:.4},"loudness":{:.4},"energy":{:.4},"jitter":{:.4},"shimmer":{:.4},"pause_ratio":{:.4}}}"#,
413 features.pitch_hz,
414 features.pitch_range,
415 features.speech_rate,
416 features.loudness,
417 features.energy,
418 features.jitter,
419 features.shimmer,
420 features.pause_ratio,
421 )
422}
423
424#[allow(dead_code)]
426pub fn dominant_prosody_emotion(profiles: &[ProsodyProfile]) -> Option<&ProsodyProfile> {
427 profiles.iter().max_by(|a, b| {
428 a.intensity
429 .partial_cmp(&b.intensity)
430 .unwrap_or(std::cmp::Ordering::Equal)
431 })
432}
433
/// Names the speed band a speech rate falls into.
///
/// Bands: below 2 is `"slow"`, below 4 is `"normal"`, below 6 is `"fast"`,
/// anything else is `"very_fast"`. A NaN rate fails every comparison and so
/// also lands in `"very_fast"`.
#[allow(dead_code)]
pub fn speech_rate_category(rate: f32) -> &'static str {
    match rate {
        r if r < 2.0 => "slow",
        r if r < 4.0 => "normal",
        r if r < 6.0 => "fast",
        _ => "very_fast",
    }
}
447
448#[allow(dead_code)]
451pub fn estimate_arousal_valence(features: &ProsodyFeatures) -> (f32, f32) {
452 let arousal =
454 (features.energy * 0.4 + features.speech_rate / 10.0 * 0.3 + features.loudness * 0.3) * 2.0
455 - 1.0;
456
457 let valence = ((features.pitch_hz - 100.0) / 300.0 * 0.5
459 + (1.0 - features.jitter * 10.0).clamp(0.0, 1.0) * 0.3
460 + (1.0 - features.pause_ratio) * 0.2)
461 * 2.0
462 - 1.0;
463
464 (arousal.clamp(-1.0, 1.0), valence.clamp(-1.0, 1.0))
465}
466
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a feature set from values given in field-declaration order:
    /// pitch_hz, pitch_range, speech_rate, loudness, energy, jitter, shimmer,
    /// pause_ratio.
    fn features_from(values: [f32; 8]) -> ProsodyFeatures {
        let [pitch_hz, pitch_range, speech_rate, loudness, energy, jitter, shimmer, pause_ratio] =
            values;
        ProsodyFeatures {
            pitch_hz,
            pitch_range,
            speech_rate,
            loudness,
            energy,
            jitter,
            shimmer,
            pause_ratio,
        }
    }

    /// The resting baseline.
    fn neutral_features() -> ProsodyFeatures {
        features_from([160.0, 40.0, 3.5, 0.5, 0.5, 0.01, 0.01, 0.25])
    }

    /// The full-intensity `Happy` target.
    fn happy_features() -> ProsodyFeatures {
        features_from([230.0, 110.0, 5.5, 0.85, 0.8, 0.02, 0.02, 0.1])
    }

    /// The full-intensity `Sad` target.
    fn sad_features() -> ProsodyFeatures {
        features_from([120.0, 20.0, 1.8, 0.25, 0.2, 0.08, 0.07, 0.55])
    }

    #[test]
    fn test_classify_happy() {
        let classified = classify_prosody_emotion(&happy_features());
        assert_eq!(classified.emotion, ProsodyEmotion::Happy);
    }

    #[test]
    fn test_classify_sad() {
        let classified = classify_prosody_emotion(&sad_features());
        assert_eq!(classified.emotion, ProsodyEmotion::Sad);
    }

    #[test]
    fn test_classify_neutral() {
        let classified = classify_prosody_emotion(&neutral_features());
        assert_eq!(classified.emotion, ProsodyEmotion::Neutral);
    }

    #[test]
    fn test_generate_happy_pitch_increases() {
        let happy = generate_prosody_for_emotion(&ProsodyEmotion::Happy, 1.0);
        assert!(happy.pitch_hz > 160.0);
    }

    #[test]
    fn test_generate_sad_pitch_decreases() {
        let sad = generate_prosody_for_emotion(&ProsodyEmotion::Sad, 1.0);
        assert!(sad.pitch_hz < 160.0);
    }

    #[test]
    fn test_prosody_to_face_params_keys() {
        let params = prosody_to_face_params(&neutral_features());
        for key in ["jaw_open", "brow_raise", "lip_corner_up"] {
            assert!(params.contains_key(key));
        }
    }

    #[test]
    fn test_prosody_to_face_params_range() {
        let params = prosody_to_face_params(&happy_features());
        for v in params.values() {
            assert!((0.0..=1.0).contains(v), "param out of range: {v}");
        }
    }

    #[test]
    fn test_interpolate_midpoint() {
        let halfway = interpolate_prosody(&neutral_features(), &happy_features(), 0.5);
        assert!(halfway.pitch_hz > 160.0 && halfway.pitch_hz < 230.0);
    }

    #[test]
    fn test_interpolate_t0_equals_a() {
        let start = neutral_features();
        let at_zero = interpolate_prosody(&start, &happy_features(), 0.0);
        assert!((at_zero.pitch_hz - start.pitch_hz).abs() < 1e-4);
    }

    #[test]
    fn test_blend_single_emotion() {
        // A lone emotion takes the whole weight, so the blend must equal that
        // emotion generated at the blend's fixed 0.7 intensity.
        let blended = blend_prosody_emotions(&[(ProsodyEmotion::Happy, 1.0)]);
        let expected = generate_prosody_for_emotion(&ProsodyEmotion::Happy, 0.7);
        assert!((blended.pitch_hz - expected.pitch_hz).abs() < 1e-3);
    }

    #[test]
    fn test_blend_empty_returns_neutral() {
        let blended = blend_prosody_emotions(&[]);
        assert!((blended.speech_rate - 3.5).abs() < 0.5);
    }

    #[test]
    fn test_prosody_similarity_self() {
        let features = neutral_features();
        let sim = prosody_similarity(&features, &features);
        assert!(
            (sim - 1.0).abs() < 1e-4,
            "self-similarity should be 1.0, got {sim}"
        );
    }

    #[test]
    fn test_prosody_similarity_different() {
        let sim = prosody_similarity(&happy_features(), &sad_features());
        assert!(sim < 1.0);
    }

    #[test]
    fn test_normalize_prosody_clamps() {
        // Every field starts outside its accepted range.
        let mut wild = features_from([-100.0, 9999.0, -5.0, 2.0, -0.5, 5.0, 5.0, 3.0]);
        normalize_prosody(&mut wild);
        assert!(wild.pitch_hz >= 50.0);
        assert!(wild.loudness <= 1.0);
        assert!(wild.jitter <= 1.0);
    }

    #[test]
    fn test_prosody_to_json_contains_fields() {
        let json = prosody_to_json(&neutral_features());
        for key in ["pitch_hz", "speech_rate"] {
            assert!(json.contains(key));
        }
    }

    #[test]
    fn test_dominant_prosody_emotion() {
        let weak_happy = ProsodyProfile {
            emotion: ProsodyEmotion::Happy,
            intensity: 0.3,
            features: happy_features(),
        };
        let strong_sad = ProsodyProfile {
            emotion: ProsodyEmotion::Sad,
            intensity: 0.8,
            features: sad_features(),
        };
        let profiles = vec![weak_happy, strong_sad];
        let dominant = dominant_prosody_emotion(&profiles).expect("should succeed");
        assert_eq!(dominant.emotion, ProsodyEmotion::Sad);
    }

    #[test]
    fn test_speech_rate_category() {
        let cases = [
            (1.0, "slow"),
            (3.0, "normal"),
            (5.0, "fast"),
            (7.0, "very_fast"),
        ];
        for (rate, label) in cases {
            assert_eq!(speech_rate_category(rate), label);
        }
    }

    #[test]
    fn test_estimate_arousal_valence_range() {
        let (arousal, valence) = estimate_arousal_valence(&neutral_features());
        assert!((-1.0..=1.0).contains(&arousal));
        assert!((-1.0..=1.0).contains(&valence));
    }

    #[test]
    fn test_arousal_higher_for_angry() {
        let arousal_of = |emotion: ProsodyEmotion| {
            estimate_arousal_valence(&generate_prosody_for_emotion(&emotion, 1.0)).0
        };
        let a_angry = arousal_of(ProsodyEmotion::Angry);
        let a_calm = arousal_of(ProsodyEmotion::Calm);
        assert!(
            a_angry > a_calm,
            "angry arousal {a_angry} should exceed calm {a_calm}"
        );
    }
}