// oxihuman_morph/speech_baker.rs

// Copyright (C) 2026 COOLJAPAN OU (Team KitaSan)
// SPDX-License-Identifier: Apache-2.0

//! Bake a phoneme event sequence into a morph weight track for lip sync.

use std::collections::HashMap;

// ── Types ────────────────────────────────────────────────────────────────────

10/// A single phoneme occurrence in time.
11#[allow(dead_code)]
12pub struct PhonemeEvent {
13    /// Phoneme label, e.g. "AA", "B", "SIL".
14    pub phoneme: String,
15    /// Start time in seconds.
16    pub start: f32,
17    /// End time in seconds.
18    pub end: f32,
19}
20
21/// Configuration for the lip-sync baker.
22#[allow(dead_code)]
23pub struct BakerConfig {
24    /// Bake resolution in frames per second (default 30).
25    pub fps: f32,
26    /// Crossfade window in seconds (default 0.05).
27    pub blend_window: f32,
28    /// Overall weight multiplier (default 1.0).
29    pub emphasis_scale: f32,
30    /// Phoneme label used for silence (default "SIL").
31    pub silence_phoneme: String,
32}
33
34impl Default for BakerConfig {
35    fn default() -> Self {
36        Self {
37            fps: 30.0,
38            blend_window: 0.05,
39            emphasis_scale: 1.0,
40            silence_phoneme: "SIL".to_string(),
41        }
42    }
43}
44
45/// The output of a bake pass.
46#[allow(dead_code)]
47pub struct BakedLipSync {
48    /// Frames-per-second used during baking.
49    pub fps: f32,
50    /// Per-frame morph weight maps.
51    pub frames: Vec<HashMap<String, f32>>,
52    /// Total duration of the baked sequence in seconds.
53    pub duration: f32,
54}
55
// ── Free functions ────────────────────────────────────────────────────────────

58/// Build a simple English phoneme → morph weight map.
59/// Morph targets: `mouth_open`, `lip_round`, `lip_wide`, `teeth_show`, `jaw_drop`.
60#[allow(dead_code)]
61pub fn build_default_viseme_map() -> HashMap<String, HashMap<String, f32>> {
62    let mut m: HashMap<String, HashMap<String, f32>> = HashMap::new();
63
64    // Silence
65    m.insert(
66        "SIL".into(),
67        [
68            ("mouth_open".into(), 0.0),
69            ("lip_round".into(), 0.0),
70            ("lip_wide".into(), 0.0),
71            ("teeth_show".into(), 0.0),
72            ("jaw_drop".into(), 0.0),
73        ]
74        .into_iter()
75        .collect(),
76    );
77
78    // /AA/ – open mouth, wide
79    m.insert(
80        "AA".into(),
81        [
82            ("mouth_open".into(), 0.9),
83            ("lip_round".into(), 0.0),
84            ("lip_wide".into(), 0.4),
85            ("teeth_show".into(), 0.6),
86            ("jaw_drop".into(), 0.8),
87        ]
88        .into_iter()
89        .collect(),
90    );
91
92    // /AE/ – mid-open, slightly wide
93    m.insert(
94        "AE".into(),
95        [
96            ("mouth_open".into(), 0.6),
97            ("lip_round".into(), 0.0),
98            ("lip_wide".into(), 0.5),
99            ("teeth_show".into(), 0.4),
100            ("jaw_drop".into(), 0.5),
101        ]
102        .into_iter()
103        .collect(),
104    );
105
106    // /IY/ – smile shape
107    m.insert(
108        "IY".into(),
109        [
110            ("mouth_open".into(), 0.2),
111            ("lip_round".into(), 0.0),
112            ("lip_wide".into(), 0.9),
113            ("teeth_show".into(), 0.5),
114            ("jaw_drop".into(), 0.1),
115        ]
116        .into_iter()
117        .collect(),
118    );
119
120    // /UW/ – round lips
121    m.insert(
122        "UW".into(),
123        [
124            ("mouth_open".into(), 0.3),
125            ("lip_round".into(), 0.9),
126            ("lip_wide".into(), 0.0),
127            ("teeth_show".into(), 0.0),
128            ("jaw_drop".into(), 0.2),
129        ]
130        .into_iter()
131        .collect(),
132    );
133
134    // /OW/ – round, mid-open
135    m.insert(
136        "OW".into(),
137        [
138            ("mouth_open".into(), 0.5),
139            ("lip_round".into(), 0.7),
140            ("lip_wide".into(), 0.0),
141            ("teeth_show".into(), 0.1),
142            ("jaw_drop".into(), 0.4),
143        ]
144        .into_iter()
145        .collect(),
146    );
147
148    // /B/ /P/ /M/ – bilabial, closed
149    for ph in &["B", "P", "M"] {
150        m.insert(
151            ph.to_string(),
152            [
153                ("mouth_open".into(), 0.0),
154                ("lip_round".into(), 0.0),
155                ("lip_wide".into(), 0.0),
156                ("teeth_show".into(), 0.0),
157                ("jaw_drop".into(), 0.0),
158            ]
159            .into_iter()
160            .collect(),
161        );
162    }
163
164    // /F/ /V/ – teeth on lower lip
165    for ph in &["F", "V"] {
166        m.insert(
167            ph.to_string(),
168            [
169                ("mouth_open".into(), 0.1),
170                ("lip_round".into(), 0.0),
171                ("lip_wide".into(), 0.3),
172                ("teeth_show".into(), 0.8),
173                ("jaw_drop".into(), 0.1),
174            ]
175            .into_iter()
176            .collect(),
177        );
178    }
179
180    // /TH/ /DH/ – tongue between teeth
181    for ph in &["TH", "DH"] {
182        m.insert(
183            ph.to_string(),
184            [
185                ("mouth_open".into(), 0.15),
186                ("lip_round".into(), 0.0),
187                ("lip_wide".into(), 0.2),
188                ("teeth_show".into(), 0.7),
189                ("jaw_drop".into(), 0.1),
190            ]
191            .into_iter()
192            .collect(),
193        );
194    }
195
196    // /S/ /Z/ – slight opening
197    for ph in &["S", "Z"] {
198        m.insert(
199            ph.to_string(),
200            [
201                ("mouth_open".into(), 0.05),
202                ("lip_round".into(), 0.0),
203                ("lip_wide".into(), 0.4),
204                ("teeth_show".into(), 0.6),
205                ("jaw_drop".into(), 0.05),
206            ]
207            .into_iter()
208            .collect(),
209        );
210    }
211
212    // /CH/ /JH/ /SH/ /ZH/ – rounded slightly
213    for ph in &["CH", "JH", "SH", "ZH"] {
214        m.insert(
215            ph.to_string(),
216            [
217                ("mouth_open".into(), 0.2),
218                ("lip_round".into(), 0.4),
219                ("lip_wide".into(), 0.1),
220                ("teeth_show".into(), 0.3),
221                ("jaw_drop".into(), 0.15),
222            ]
223            .into_iter()
224            .collect(),
225        );
226    }
227
228    // /R/ – slight rounding
229    m.insert(
230        "R".into(),
231        [
232            ("mouth_open".into(), 0.2),
233            ("lip_round".into(), 0.3),
234            ("lip_wide".into(), 0.0),
235            ("teeth_show".into(), 0.1),
236            ("jaw_drop".into(), 0.2),
237        ]
238        .into_iter()
239        .collect(),
240    );
241
242    // /L/ /N/ /D/ /T/ – neutral-ish
243    for ph in &["L", "N", "D", "T"] {
244        m.insert(
245            ph.to_string(),
246            [
247                ("mouth_open".into(), 0.3),
248                ("lip_round".into(), 0.0),
249                ("lip_wide".into(), 0.3),
250                ("teeth_show".into(), 0.3),
251                ("jaw_drop".into(), 0.2),
252            ]
253            .into_iter()
254            .collect(),
255        );
256    }
257
258    m
259}
260
261/// Return the active phonemes (and their crossfade blend weights) at time `t`.
262/// Returns a `Vec<(phoneme, blend_weight)>`. Weights sum to ≤1.0.
263#[allow(dead_code)]
264pub fn active_phonemes_at(
265    events: &[PhonemeEvent],
266    t: f32,
267    blend_window: f32,
268) -> Vec<(String, f32)> {
269    let mut contributions: Vec<(String, f32)> = Vec::new();
270
271    for ev in events {
272        if t < ev.start - blend_window || t > ev.end + blend_window {
273            continue;
274        }
275
276        let weight = if t < ev.start {
277            // Fade-in ramp before this event's start (overlap with previous).
278            let d = ev.start - t;
279            1.0 - (d / blend_window).clamp(0.0, 1.0)
280        } else if t > ev.end {
281            // Fade-out ramp after this event's end (overlap with next).
282            let d = t - ev.end;
283            1.0 - (d / blend_window).clamp(0.0, 1.0)
284        } else {
285            1.0
286        };
287
288        if weight > 0.0 {
289            contributions.push((ev.phoneme.clone(), weight));
290        }
291    }
292
293    // Normalise so weights sum to 1.0.
294    let total: f32 = contributions.iter().map(|(_, w)| w).sum();
295    if total > 1.0 {
296        for (_, w) in &mut contributions {
297            *w /= total;
298        }
299    }
300    contributions
301}
302
303/// Weighted sum of viseme morph weights from multiple phoneme contributions.
304#[allow(dead_code)]
305pub fn blend_viseme_weights(
306    contributions: &[(String, f32)],
307    viseme_map: &HashMap<String, HashMap<String, f32>>,
308) -> HashMap<String, f32> {
309    let mut result: HashMap<String, f32> = HashMap::new();
310
311    for (phoneme, weight) in contributions {
312        if let Some(morphs) = viseme_map.get(phoneme) {
313            for (morph, &v) in morphs {
314                *result.entry(morph.clone()).or_insert(0.0) += v * weight;
315            }
316        }
317    }
318    result
319}
320
321/// Bake a phoneme sequence into a [`BakedLipSync`] track.
322#[allow(dead_code)]
323pub fn bake_phoneme_sequence(
324    events: &[PhonemeEvent],
325    viseme_map: &HashMap<String, HashMap<String, f32>>,
326    cfg: &BakerConfig,
327) -> BakedLipSync {
328    // Compute duration from events.
329    let duration = events.iter().map(|e| e.end).fold(0.0_f32, f32::max);
330    let frame_count = (duration * cfg.fps).ceil() as usize + 1;
331
332    let silence_map: HashMap<String, f32> = viseme_map
333        .get(&cfg.silence_phoneme)
334        .cloned()
335        .unwrap_or_default();
336
337    let frames: Vec<HashMap<String, f32>> = (0..frame_count)
338        .map(|i| {
339            let t = (i as f32) / cfg.fps;
340            let contributions = active_phonemes_at(events, t, cfg.blend_window);
341
342            let mut weights = if contributions.is_empty() {
343                silence_map.clone()
344            } else {
345                blend_viseme_weights(&contributions, viseme_map)
346            };
347
348            // Apply emphasis scale.
349            if (cfg.emphasis_scale - 1.0).abs() > f32::EPSILON {
350                for v in weights.values_mut() {
351                    *v = (*v * cfg.emphasis_scale).clamp(0.0, 1.0);
352                }
353            }
354            weights
355        })
356        .collect();
357
358    BakedLipSync {
359        fps: cfg.fps,
360        frames,
361        duration,
362    }
363}
364
365// ── Tests ─────────────────────────────────────────────────────────────────────
366#[cfg(test)]
367mod tests {
368    use super::*;
369
370    fn ev(phoneme: &str, start: f32, end: f32) -> PhonemeEvent {
371        PhonemeEvent {
372            phoneme: phoneme.to_string(),
373            start,
374            end,
375        }
376    }
377
378    // 1. active_phonemes_at: in middle of phoneme → 100% weight
379    #[test]
380    fn test_active_in_middle_full_weight() {
381        let events = vec![ev("AA", 0.0, 1.0)];
382        let result = active_phonemes_at(&events, 0.5, 0.05);
383        assert_eq!(result.len(), 1);
384        assert_eq!(result[0].0, "AA");
385        assert!((result[0].1 - 1.0).abs() < 1e-5);
386    }
387
388    // 2. active_phonemes_at: in crossfade → both phonemes active
389    #[test]
390    fn test_active_in_crossfade_both_present() {
391        let events = vec![ev("AA", 0.0, 1.0), ev("IY", 1.0, 2.0)];
392        // At t=0.98, we're in AA's fade-out AND before IY starts: blend_window=0.05
393        // AA ends at 1.0, so fade-out covers 1.0..1.05 (after end).
394        // IY starts at 1.0, fade-in covers 0.95..1.0 (before start).
395        let result = active_phonemes_at(&events, 0.97, 0.05);
396        // Only AA should be fully active here (IY fade-in starts at 0.95).
397        let has_iy = result.iter().any(|(p, _)| p == "IY");
398        let has_aa = result.iter().any(|(p, _)| p == "AA");
399        assert!(has_aa, "AA should be active at t=0.97");
400        // IY fade-in at t=0.97: d=1.0-0.97=0.03 < blend_window=0.05, weight = 1 - 0.03/0.05 = 0.4 > 0
401        assert!(has_iy, "IY should be in fade-in at t=0.97");
402    }
403
404    // 3. active_phonemes_at: before first event → empty (no silence event)
405    #[test]
406    fn test_active_before_first_event_empty() {
407        let events = vec![ev("AA", 1.0, 2.0)];
408        let result = active_phonemes_at(&events, 0.0, 0.05);
409        assert!(result.is_empty());
410    }
411
412    // 4. active_phonemes_at: after last event (past blend_window) → empty
413    #[test]
414    fn test_active_after_last_event_empty() {
415        let events = vec![ev("AA", 0.0, 1.0)];
416        let result = active_phonemes_at(&events, 1.2, 0.05);
417        assert!(result.is_empty());
418    }
419
420    // 5. blend_viseme_weights: weighted sum is correct
421    #[test]
422    fn test_blend_viseme_weights_sum() {
423        let vm = build_default_viseme_map();
424        let contributions = vec![("AA".to_string(), 1.0_f32)];
425        let weights = blend_viseme_weights(&contributions, &vm);
426        let aa = vm.get("AA").expect("should succeed");
427        for (k, &v) in aa {
428            assert!((weights[k] - v).abs() < 1e-5, "key {} mismatch", k);
429        }
430    }
431
432    // 6. bake_phoneme_sequence: correct frame count
433    #[test]
434    fn test_bake_frame_count() {
435        let events = vec![ev("AA", 0.0, 1.0)];
436        let vm = build_default_viseme_map();
437        let cfg = BakerConfig::default();
438        let baked = bake_phoneme_sequence(&events, &vm, &cfg);
439        let expected = (1.0_f32 * 30.0).ceil() as usize + 1;
440        assert_eq!(baked.frames.len(), expected);
441    }
442
443    // 7. baked frame at a phoneme-active time has expected morph keys
444    #[test]
445    fn test_baked_frame_has_morph_keys() {
446        let events = vec![ev("AA", 0.0, 1.0)];
447        let vm = build_default_viseme_map();
448        let cfg = BakerConfig::default();
449        let baked = bake_phoneme_sequence(&events, &vm, &cfg);
450        // Frame 0 corresponds to t=0.0 (inside AA).
451        let frame = &baked.frames[0];
452        assert!(frame.contains_key("mouth_open"));
453        assert!(frame.contains_key("lip_round"));
454    }
455
456    // 8. build_default_viseme_map contains "SIL"
457    #[test]
458    fn test_default_viseme_map_contains_sil() {
459        let vm = build_default_viseme_map();
460        assert!(vm.contains_key("SIL"));
461    }
462
463    // 9. BakerConfig defaults
464    #[test]
465    fn test_baker_config_defaults() {
466        let cfg = BakerConfig::default();
467        assert!((cfg.fps - 30.0).abs() < 1e-5);
468        assert!((cfg.blend_window - 0.05).abs() < 1e-5);
469        assert!((cfg.emphasis_scale - 1.0).abs() < 1e-5);
470        assert_eq!(cfg.silence_phoneme, "SIL");
471    }
472
473    // 10. emphasis_scale applies to baked weights
474    #[test]
475    fn test_emphasis_scale_applies() {
476        let events = vec![ev("AA", 0.0, 1.0)];
477        let vm = build_default_viseme_map();
478        let cfg_normal = BakerConfig::default();
479        let cfg_half = BakerConfig {
480            emphasis_scale: 0.5,
481            ..Default::default()
482        };
483
484        let baked_normal = bake_phoneme_sequence(&events, &vm, &cfg_normal);
485        let baked_half = bake_phoneme_sequence(&events, &vm, &cfg_half);
486
487        let frame_idx = 5; // somewhere in the middle
488        let mouth_open_normal = baked_normal.frames[frame_idx]
489            .get("mouth_open")
490            .copied()
491            .unwrap_or(0.0);
492        let mouth_open_half = baked_half.frames[frame_idx]
493            .get("mouth_open")
494            .copied()
495            .unwrap_or(0.0);
496        // With emphasis 0.5, mouth_open should be approximately half.
497        if mouth_open_normal > 0.01 {
498            assert!(
499                mouth_open_half < mouth_open_normal,
500                "half scale should be smaller"
501            );
502        }
503    }
504
505    // 11. blend_viseme_weights: two contributions average correctly
506    #[test]
507    fn test_blend_viseme_weights_two_contributions() {
508        let vm = build_default_viseme_map();
509        // SIL = all zeros, AA has mouth_open=0.9 → blend at 0.5 each → mouth_open=0.45
510        let contributions = vec![("SIL".to_string(), 0.5_f32), ("AA".to_string(), 0.5_f32)];
511        let weights = blend_viseme_weights(&contributions, &vm);
512        let expected = 0.9 * 0.5;
513        assert!((weights["mouth_open"] - expected).abs() < 1e-5);
514    }
515
516    // 12. baked sequence: BakedLipSync fps matches config
517    #[test]
518    fn test_baked_fps_matches_config() {
519        let events = vec![ev("IY", 0.0, 0.5)];
520        let vm = build_default_viseme_map();
521        let cfg = BakerConfig {
522            fps: 24.0,
523            ..Default::default()
524        };
525        let baked = bake_phoneme_sequence(&events, &vm, &cfg);
526        assert!((baked.fps - 24.0).abs() < 1e-5);
527    }
528}