1use std::collections::HashMap;
7
/// A single phoneme occurrence on the lip-sync timeline.
///
/// `start` and `end` are compared directly against the sample time used
/// while baking (frame index divided by the configured `fps`), so they must
/// be expressed in that same time unit.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct PhonemeEvent {
    /// Phoneme label; used as the lookup key into the viseme map.
    pub phoneme: String,
    /// First instant at which the phoneme is at full weight.
    pub start: f32,
    /// Last instant at which the phoneme is at full weight.
    pub end: f32,
}
20
/// Settings that control how a phoneme timeline is baked into frames.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct BakerConfig {
    /// Sampling rate: frame `i` is evaluated at time `i / fps`.
    pub fps: f32,
    /// Width of the linear fade applied before the start and after the end
    /// of every phoneme event.
    pub blend_window: f32,
    /// Multiplier applied to every morph weight of a frame; the scaled value
    /// is clamped to `[0, 1]`. The scaling pass is skipped when this is
    /// within `f32::EPSILON` of `1.0`.
    pub emphasis_scale: f32,
    /// Viseme-map key whose pose is used for frames with no active phoneme.
    pub silence_phoneme: String,
}

impl Default for BakerConfig {
    /// Returns a config with `fps = 30`, a `0.05` blend window, no emphasis
    /// scaling and `"SIL"` as the silence phoneme.
    fn default() -> Self {
        Self {
            fps: 30.0,
            blend_window: 0.05,
            emphasis_scale: 1.0,
            silence_phoneme: String::from("SIL"),
        }
    }
}
44
/// Output of a bake: one morph-weight map per sampled frame.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct BakedLipSync {
    /// Frame rate the frames were sampled at (copied from the bake config).
    pub fps: f32,
    /// Morph name → weight for each frame, in increasing time order.
    pub frames: Vec<HashMap<String, f32>>,
    /// Largest event end time on the baked timeline (`0.0` when there were
    /// no events).
    pub duration: f32,
}
55
/// Builds the built-in phoneme → viseme table.
///
/// Every entry maps the same five morph names (`mouth_open`, `lip_round`,
/// `lip_wide`, `teeth_show`, `jaw_drop`) to a weight. Phonemes listed
/// together in one table row share an identical pose.
#[allow(dead_code)]
pub fn build_default_viseme_map() -> HashMap<String, HashMap<String, f32>> {
    // Column order for the weight arrays in `POSES`.
    const MORPHS: [&str; 5] = [
        "mouth_open",
        "lip_round",
        "lip_wide",
        "teeth_show",
        "jaw_drop",
    ];

    // (phonemes sharing the pose, weights in `MORPHS` order)
    const POSES: &[(&[&str], [f32; 5])] = &[
        (&["SIL"], [0.0, 0.0, 0.0, 0.0, 0.0]),
        (&["AA"], [0.9, 0.0, 0.4, 0.6, 0.8]),
        (&["AE"], [0.6, 0.0, 0.5, 0.4, 0.5]),
        (&["IY"], [0.2, 0.0, 0.9, 0.5, 0.1]),
        (&["UW"], [0.3, 0.9, 0.0, 0.0, 0.2]),
        (&["OW"], [0.5, 0.7, 0.0, 0.1, 0.4]),
        (&["B", "P", "M"], [0.0, 0.0, 0.0, 0.0, 0.0]),
        (&["F", "V"], [0.1, 0.0, 0.3, 0.8, 0.1]),
        (&["TH", "DH"], [0.15, 0.0, 0.2, 0.7, 0.1]),
        (&["S", "Z"], [0.05, 0.0, 0.4, 0.6, 0.05]),
        (&["CH", "JH", "SH", "ZH"], [0.2, 0.4, 0.1, 0.3, 0.15]),
        (&["R"], [0.2, 0.3, 0.0, 0.1, 0.2]),
        (&["L", "N", "D", "T"], [0.3, 0.0, 0.3, 0.3, 0.2]),
    ];

    let mut map: HashMap<String, HashMap<String, f32>> = HashMap::new();
    for (phonemes, weights) in POSES {
        for phoneme in phonemes.iter() {
            let pose: HashMap<String, f32> = MORPHS
                .iter()
                .zip(weights.iter())
                .map(|(morph, &weight)| (morph.to_string(), weight))
                .collect();
            map.insert(phoneme.to_string(), pose);
        }
    }
    map
}
260
261#[allow(dead_code)]
264pub fn active_phonemes_at(
265 events: &[PhonemeEvent],
266 t: f32,
267 blend_window: f32,
268) -> Vec<(String, f32)> {
269 let mut contributions: Vec<(String, f32)> = Vec::new();
270
271 for ev in events {
272 if t < ev.start - blend_window || t > ev.end + blend_window {
273 continue;
274 }
275
276 let weight = if t < ev.start {
277 let d = ev.start - t;
279 1.0 - (d / blend_window).clamp(0.0, 1.0)
280 } else if t > ev.end {
281 let d = t - ev.end;
283 1.0 - (d / blend_window).clamp(0.0, 1.0)
284 } else {
285 1.0
286 };
287
288 if weight > 0.0 {
289 contributions.push((ev.phoneme.clone(), weight));
290 }
291 }
292
293 let total: f32 = contributions.iter().map(|(_, w)| w).sum();
295 if total > 1.0 {
296 for (_, w) in &mut contributions {
297 *w /= total;
298 }
299 }
300 contributions
301}
302
/// Mixes the viseme poses of the given phonemes into one morph-weight map.
///
/// For every `(phoneme, weight)` pair that has an entry in `viseme_map`,
/// each morph value of that entry is multiplied by `weight` and added to the
/// running total for that morph. Phonemes missing from `viseme_map` are
/// skipped, so the result is empty when nothing matches.
#[allow(dead_code)]
pub fn blend_viseme_weights(
    contributions: &[(String, f32)],
    viseme_map: &HashMap<String, HashMap<String, f32>>,
) -> HashMap<String, f32> {
    contributions
        .iter()
        .filter_map(|(phoneme, weight)| viseme_map.get(phoneme).map(|pose| (pose, *weight)))
        .fold(HashMap::new(), |mut blended, (pose, weight)| {
            for (morph, &value) in pose {
                *blended.entry(morph.clone()).or_default() += value * weight;
            }
            blended
        })
}
320
321#[allow(dead_code)]
323pub fn bake_phoneme_sequence(
324 events: &[PhonemeEvent],
325 viseme_map: &HashMap<String, HashMap<String, f32>>,
326 cfg: &BakerConfig,
327) -> BakedLipSync {
328 let duration = events.iter().map(|e| e.end).fold(0.0_f32, f32::max);
330 let frame_count = (duration * cfg.fps).ceil() as usize + 1;
331
332 let silence_map: HashMap<String, f32> = viseme_map
333 .get(&cfg.silence_phoneme)
334 .cloned()
335 .unwrap_or_default();
336
337 let frames: Vec<HashMap<String, f32>> = (0..frame_count)
338 .map(|i| {
339 let t = (i as f32) / cfg.fps;
340 let contributions = active_phonemes_at(events, t, cfg.blend_window);
341
342 let mut weights = if contributions.is_empty() {
343 silence_map.clone()
344 } else {
345 blend_viseme_weights(&contributions, viseme_map)
346 };
347
348 if (cfg.emphasis_scale - 1.0).abs() > f32::EPSILON {
350 for v in weights.values_mut() {
351 *v = (*v * cfg.emphasis_scale).clamp(0.0, 1.0);
352 }
353 }
354 weights
355 })
356 .collect();
357
358 BakedLipSync {
359 fps: cfg.fps,
360 frames,
361 duration,
362 }
363}
364
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand constructor for a `PhonemeEvent`.
    fn ev(phoneme: &str, start: f32, end: f32) -> PhonemeEvent {
        PhonemeEvent {
            phoneme: phoneme.to_string(),
            start,
            end,
        }
    }

    /// A sample well inside an event yields that phoneme alone at full weight.
    #[test]
    fn test_active_in_middle_full_weight() {
        let events = vec![ev("AA", 0.0, 1.0)];
        let result = active_phonemes_at(&events, 0.5, 0.05);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "AA");
        assert!((result[0].1 - 1.0).abs() < 1e-5);
    }

    /// Near a boundary both the ending and the upcoming phoneme contribute.
    #[test]
    fn test_active_in_crossfade_both_present() {
        let events = vec![ev("AA", 0.0, 1.0), ev("IY", 1.0, 2.0)];
        let result = active_phonemes_at(&events, 0.97, 0.05);
        let has_iy = result.iter().any(|(p, _)| p == "IY");
        let has_aa = result.iter().any(|(p, _)| p == "AA");
        assert!(has_aa, "AA should be active at t=0.97");
        assert!(has_iy, "IY should be in fade-in at t=0.97");
    }

    /// Nothing is active long before the first event starts.
    #[test]
    fn test_active_before_first_event_empty() {
        let events = vec![ev("AA", 1.0, 2.0)];
        let result = active_phonemes_at(&events, 0.0, 0.05);
        assert!(result.is_empty());
    }

    /// Nothing is active once the last event has fully faded out.
    #[test]
    fn test_active_after_last_event_empty() {
        let events = vec![ev("AA", 0.0, 1.0)];
        let result = active_phonemes_at(&events, 1.2, 0.05);
        assert!(result.is_empty());
    }

    /// A single full-weight contribution reproduces its viseme pose exactly.
    #[test]
    fn test_blend_viseme_weights_sum() {
        let vm = build_default_viseme_map();
        let contributions = vec![("AA".to_string(), 1.0_f32)];
        let weights = blend_viseme_weights(&contributions, &vm);
        let aa = vm.get("AA").expect("default map defines AA");
        for (k, &v) in aa {
            assert!((weights[k] - v).abs() < 1e-5, "key {} mismatch", k);
        }
    }

    /// Frame count is `ceil(duration * fps) + 1`.
    #[test]
    fn test_bake_frame_count() {
        let events = vec![ev("AA", 0.0, 1.0)];
        let vm = build_default_viseme_map();
        let cfg = BakerConfig::default();
        let baked = bake_phoneme_sequence(&events, &vm, &cfg);
        let expected = (1.0_f32 * 30.0).ceil() as usize + 1;
        assert_eq!(baked.frames.len(), expected);
    }

    /// Baked frames carry the morph keys of the active viseme.
    #[test]
    fn test_baked_frame_has_morph_keys() {
        let events = vec![ev("AA", 0.0, 1.0)];
        let vm = build_default_viseme_map();
        let cfg = BakerConfig::default();
        let baked = bake_phoneme_sequence(&events, &vm, &cfg);
        let frame = &baked.frames[0];
        assert!(frame.contains_key("mouth_open"));
        assert!(frame.contains_key("lip_round"));
    }

    /// The default map defines the silence phoneme used by the default config.
    #[test]
    fn test_default_viseme_map_contains_sil() {
        let vm = build_default_viseme_map();
        assert!(vm.contains_key("SIL"));
    }

    /// `BakerConfig::default()` exposes the documented defaults.
    #[test]
    fn test_baker_config_defaults() {
        let cfg = BakerConfig::default();
        assert!((cfg.fps - 30.0).abs() < 1e-5);
        assert!((cfg.blend_window - 0.05).abs() < 1e-5);
        assert!((cfg.emphasis_scale - 1.0).abs() < 1e-5);
        assert_eq!(cfg.silence_phoneme, "SIL");
    }

    /// Halving the emphasis scale halves the baked weight.
    #[test]
    fn test_emphasis_scale_applies() {
        let events = vec![ev("AA", 0.0, 1.0)];
        let vm = build_default_viseme_map();
        let cfg_normal = BakerConfig::default();
        let cfg_half = BakerConfig {
            emphasis_scale: 0.5,
            ..Default::default()
        };

        let baked_normal = bake_phoneme_sequence(&events, &vm, &cfg_normal);
        let baked_half = bake_phoneme_sequence(&events, &vm, &cfg_half);

        // Frame 5 (t = 5/30) lies well inside the event, so "AA" is at full weight.
        let frame_idx = 5;
        let mouth_open_normal = baked_normal.frames[frame_idx]
            .get("mouth_open")
            .copied()
            .unwrap_or(0.0);
        let mouth_open_half = baked_half.frames[frame_idx]
            .get("mouth_open")
            .copied()
            .unwrap_or(0.0);

        // Assert unconditionally so the test cannot pass without checking anything.
        assert!(
            mouth_open_normal > 0.01,
            "AA should open the mouth at frame {}",
            frame_idx
        );
        assert!(
            mouth_open_half < mouth_open_normal,
            "half scale should be smaller"
        );
        assert!(
            (mouth_open_half - mouth_open_normal * 0.5).abs() < 1e-5,
            "half scale should halve the weight"
        );
    }

    /// Two half-weight contributions blend linearly.
    #[test]
    fn test_blend_viseme_weights_two_contributions() {
        let vm = build_default_viseme_map();
        let contributions = vec![("SIL".to_string(), 0.5_f32), ("AA".to_string(), 0.5_f32)];
        let weights = blend_viseme_weights(&contributions, &vm);
        // SIL contributes 0.0 and AA contributes 0.9, each at half weight.
        let expected = 0.9 * 0.5;
        assert!((weights["mouth_open"] - expected).abs() < 1e-5);
    }

    /// The baked result reports the frame rate it was sampled at.
    #[test]
    fn test_baked_fps_matches_config() {
        let events = vec![ev("IY", 0.0, 0.5)];
        let vm = build_default_viseme_map();
        let cfg = BakerConfig {
            fps: 24.0,
            ..Default::default()
        };
        let baked = bake_phoneme_sequence(&events, &vm, &cfg);
        assert!((baked.fps - 24.0).abs() < 1e-5);
    }
}