Skip to main content

oxihuman_morph/
lip_sync_advanced.rs

1//! Advanced lip sync with phoneme scheduling and coarticulation.
2
/// A single timed phoneme placed on a lip sync track.
///
/// The evaluation code in this file treats the event as active on the
/// half-open window `[start_time, start_time + duration)`.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct PhonemeEvent {
    /// Phoneme label such as `"AA"`, `"B"` or `"SIL"`.
    pub phoneme: String,
    /// Time at which the event's window opens.
    pub start_time: f32,
    /// Length of the event's window, in the same unit as `start_time`.
    pub duration: f32,
    /// Multiplier applied to the phoneme's mouth shape during evaluation.
    pub intensity: f32,
}
11
/// Tunable parameters for blending between neighbouring phonemes.
///
/// Only `lookahead` is read by the evaluation code visible in this file;
/// `lookbehind` and `smoothing` are stored but not consumed here.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct CoarticulationParams {
    /// How far ahead to blend toward next phoneme (secs).
    pub lookahead: f32,
    /// Blend tail from previous phoneme (secs).
    pub lookbehind: f32,
    /// Smoothing factor in 0..1.
    pub smoothing: f32,
}
21
22#[allow(dead_code)]
23pub struct LipSyncTrack {
24    pub events: Vec<PhonemeEvent>,
25    pub duration: f32,
26    pub coarticulation: CoarticulationParams,
27}
28
/// The evaluated mouth pose of a lip sync track at one instant.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct LipSyncFrame {
    /// The time the frame was evaluated at.
    pub time: f32,
    /// Label of the event active at `time`; empty when no event is active.
    pub active_phoneme: String,
    /// Label of the upcoming event being blended toward, if any.
    pub blend_phoneme: Option<String>,
    /// Blend weight toward `blend_phoneme` (0.0 when there is none).
    pub blend_weight: f32,
    /// Resulting mouth-open amount.
    pub mouth_open: f32,
    /// Resulting lip-corner pull amount.
    pub lip_corner_pull: f32,
    /// Resulting lip-press amount.
    pub lip_press: f32,
}
39
40// ---------------------------------------------------------------------------
41// Construction helpers
42// ---------------------------------------------------------------------------
43
44#[allow(dead_code)]
45pub fn default_coarticulation() -> CoarticulationParams {
46    CoarticulationParams {
47        lookahead: 0.05,
48        lookbehind: 0.03,
49        smoothing: 0.3,
50    }
51}
52
53#[allow(dead_code)]
54pub fn new_lip_sync_track(duration: f32) -> LipSyncTrack {
55    LipSyncTrack {
56        events: Vec::new(),
57        duration,
58        coarticulation: default_coarticulation(),
59    }
60}
61
62#[allow(dead_code)]
63pub fn add_phoneme_event(track: &mut LipSyncTrack, event: PhonemeEvent) {
64    track.events.push(event);
65}
66
67// ---------------------------------------------------------------------------
68// Sorting / indexing
69// ---------------------------------------------------------------------------
70
71#[allow(dead_code)]
72pub fn sort_phoneme_events(track: &mut LipSyncTrack) {
73    track.events.sort_by(|a, b| {
74        a.start_time
75            .partial_cmp(&b.start_time)
76            .unwrap_or(std::cmp::Ordering::Equal)
77    });
78}
79
80#[allow(dead_code)]
81pub fn event_count(track: &LipSyncTrack) -> usize {
82    track.events.len()
83}
84
85#[allow(dead_code)]
86pub fn phonemes_at_time(track: &LipSyncTrack, time: f32) -> Vec<&PhonemeEvent> {
87    track
88        .events
89        .iter()
90        .filter(|e| time >= e.start_time && time < e.start_time + e.duration)
91        .collect()
92}
93
94// ---------------------------------------------------------------------------
95// Phoneme → mouth shape
96// ---------------------------------------------------------------------------
97
/// Maps a phoneme label to its `(mouth_open, lip_corner_pull, lip_press)`
/// mouth shape.
///
/// The label is upper-cased before lookup, so matching is case-insensitive.
/// `"SIL"` and the empty string yield an all-zero (closed, relaxed) shape;
/// any label not in the table falls back to a generic `(0.3, 0.1, 0.1)`.
#[allow(dead_code)]
pub fn phoneme_to_mouth_shape(phoneme: &str) -> (f32, f32, f32) {
    type Shape = (f32, f32, f32);

    // Shape used for every label the table does not know.
    const FALLBACK: Shape = (0.3, 0.1, 0.1);

    // Each row lists the labels that share one shape. Labels are unique
    // across rows, so row order does not affect the result.
    const TABLE: &[(&[&str], Shape)] = &[
        // Vowels
        (&["AA", "AH"], (0.8, 0.1, 0.0)),
        (&["AE"], (0.7, 0.3, 0.0)),
        (&["AO"], (0.6, 0.0, 0.0)),
        (&["AW"], (0.5, 0.0, 0.1)),
        (&["AY"], (0.7, 0.2, 0.0)),
        (&["EH"], (0.5, 0.4, 0.0)),
        (&["ER"], (0.4, 0.1, 0.1)),
        (&["EY"], (0.4, 0.5, 0.0)),
        (&["IH", "IY"], (0.2, 0.6, 0.0)),
        (&["OW"], (0.5, 0.0, 0.2)),
        (&["OY"], (0.5, 0.0, 0.3)),
        (&["UH", "UW"], (0.3, 0.0, 0.4)),
        // Bilabials
        (&["B", "P", "M"], (0.0, 0.0, 0.8)),
        // Labiodentals
        (&["F", "V"], (0.1, 0.0, 0.6)),
        // Dentals / sibilants
        (&["TH", "DH"], (0.2, 0.0, 0.2)),
        (&["S", "Z"], (0.1, 0.3, 0.3)),
        (&["SH", "ZH"], (0.2, 0.1, 0.4)),
        // Rest / silence
        (&["SIL", ""], (0.0, 0.0, 0.0)),
    ];

    let label = phoneme.to_uppercase();
    TABLE
        .iter()
        .find(|(labels, _)| labels.iter().any(|&known| known == label.as_str()))
        .map_or(FALLBACK, |&(_, shape)| shape)
}
128
129// ---------------------------------------------------------------------------
130// Evaluation
131// ---------------------------------------------------------------------------
132
133#[allow(dead_code)]
134pub fn evaluate_lip_sync(track: &LipSyncTrack, time: f32) -> LipSyncFrame {
135    // Find the active phoneme event (last one whose window covers `time`).
136    let active = track
137        .events
138        .iter()
139        .rfind(|e| time >= e.start_time && time < e.start_time + e.duration);
140
141    // Find the next phoneme event (soonest starting after `time` within lookahead).
142    let next = track
143        .events
144        .iter()
145        .filter(|e| e.start_time > time && e.start_time - time <= track.coarticulation.lookahead)
146        .min_by(|a, b| {
147            a.start_time
148                .partial_cmp(&b.start_time)
149                .unwrap_or(std::cmp::Ordering::Equal)
150        });
151
152    let (active_phoneme, base_open, base_corner, base_press) = if let Some(ev) = active {
153        let (o, c, p) = phoneme_to_mouth_shape(&ev.phoneme);
154        (
155            ev.phoneme.clone(),
156            o * ev.intensity,
157            c * ev.intensity,
158            p * ev.intensity,
159        )
160    } else {
161        (String::new(), 0.0, 0.0, 0.0)
162    };
163
164    let (blend_phoneme, blend_weight, mouth_open, lip_corner_pull, lip_press) =
165        if let Some(nev) = next {
166            let dist = nev.start_time - time;
167            let weight = (1.0 - dist / track.coarticulation.lookahead).clamp(0.0, 1.0);
168            let (no, nc, np) = phoneme_to_mouth_shape(&nev.phoneme);
169            let weight_scaled = weight * nev.intensity;
170            let w_inv = 1.0 - weight_scaled;
171            (
172                Some(nev.phoneme.clone()),
173                weight,
174                base_open * w_inv + no * weight_scaled,
175                base_corner * w_inv + nc * weight_scaled,
176                base_press * w_inv + np * weight_scaled,
177            )
178        } else {
179            (None, 0.0, base_open, base_corner, base_press)
180        };
181
182    LipSyncFrame {
183        time,
184        active_phoneme,
185        blend_phoneme,
186        blend_weight,
187        mouth_open,
188        lip_corner_pull,
189        lip_press,
190    }
191}
192
193// ---------------------------------------------------------------------------
194// Viseme weights
195// ---------------------------------------------------------------------------
196
197#[allow(dead_code)]
198pub fn lip_sync_to_viseme_weights(track: &LipSyncTrack, time: f32) -> Vec<(String, f32)> {
199    let frame = evaluate_lip_sync(track, time);
200    let mut weights: Vec<(String, f32)> = Vec::new();
201    if !frame.active_phoneme.is_empty() {
202        let w = 1.0 - frame.blend_weight;
203        if w > 0.001 {
204            weights.push((frame.active_phoneme.clone(), w));
205        }
206    }
207    if let Some(blend) = frame.blend_phoneme {
208        if frame.blend_weight > 0.001 {
209            weights.push((blend, frame.blend_weight));
210        }
211    }
212    weights
213}
214
215// ---------------------------------------------------------------------------
216// Editing helpers
217// ---------------------------------------------------------------------------
218
219#[allow(dead_code)]
220pub fn trim_lip_sync(track: &mut LipSyncTrack, start: f32, end: f32) {
221    track.events.retain(|e| {
222        let ev_end = e.start_time + e.duration;
223        ev_end > start && e.start_time < end
224    });
225    track.duration = end - start;
226}
227
228#[allow(dead_code)]
229pub fn scale_lip_sync_timing(track: &mut LipSyncTrack, factor: f32) {
230    for event in track.events.iter_mut() {
231        event.start_time *= factor;
232        event.duration *= factor;
233    }
234    track.duration *= factor;
235}
236
237#[allow(dead_code)]
238pub fn merge_lip_sync_tracks(a: &LipSyncTrack, b: &LipSyncTrack) -> LipSyncTrack {
239    let mut merged = new_lip_sync_track(a.duration.max(b.duration));
240    for ev in &a.events {
241        merged.events.push(ev.clone());
242    }
243    for ev in &b.events {
244        merged.events.push(ev.clone());
245    }
246    sort_phoneme_events(&mut merged);
247    merged
248}
249
250// ---------------------------------------------------------------------------
251// Tests
252// ---------------------------------------------------------------------------
253
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance for values that should be exact up to f32 rounding.
    const EPS: f32 = 1e-6;
    /// Looser tolerance for values derived from non-representable decimals.
    const LOOSE_EPS: f32 = 1e-3;

    /// Builds a full-intensity event.
    fn make_event(phoneme: &str, start: f32, dur: f32) -> PhonemeEvent {
        PhonemeEvent {
            phoneme: phoneme.to_string(),
            start_time: start,
            duration: dur,
            intensity: 1.0,
        }
    }

    /// Builds a track with "AA" on [0.0, 0.5) followed by "B" on [0.5, 0.7).
    fn make_aa_then_b_track() -> LipSyncTrack {
        let mut track = new_lip_sync_track(2.0);
        add_phoneme_event(&mut track, make_event("AA", 0.0, 0.5));
        add_phoneme_event(&mut track, make_event("B", 0.5, 0.2));
        track
    }

    #[test]
    fn test_new_track() {
        let track = new_lip_sync_track(5.0);
        assert!((track.duration - 5.0).abs() < EPS);
        assert!(track.events.is_empty());
    }

    #[test]
    fn test_add_event() {
        let mut track = new_lip_sync_track(3.0);
        add_phoneme_event(&mut track, make_event("AA", 0.0, 0.2));
        assert_eq!(track.events.len(), 1);
    }

    #[test]
    fn test_event_count() {
        let mut track = new_lip_sync_track(3.0);
        assert_eq!(event_count(&track), 0);
        add_phoneme_event(&mut track, make_event("AA", 0.0, 0.2));
        add_phoneme_event(&mut track, make_event("B", 0.2, 0.1));
        assert_eq!(event_count(&track), 2);
    }

    #[test]
    fn test_evaluate_lip_sync_active() {
        let mut track = new_lip_sync_track(2.0);
        add_phoneme_event(&mut track, make_event("AA", 0.0, 0.5));
        let frame = evaluate_lip_sync(&track, 0.2);
        assert_eq!(frame.active_phoneme, "AA");
        assert!(frame.mouth_open > 0.0);
    }

    #[test]
    fn test_evaluate_lip_sync_silence() {
        let track = new_lip_sync_track(2.0);
        let frame = evaluate_lip_sync(&track, 0.5);
        assert_eq!(frame.active_phoneme, "");
        assert!((frame.mouth_open).abs() < EPS);
    }

    #[test]
    fn test_evaluate_lip_sync_blends_toward_next() {
        // With the default 0.05 lookahead, t = 0.475 is halfway into the
        // blend window that precedes "B" at 0.5.
        let track = make_aa_then_b_track();
        let frame = evaluate_lip_sync(&track, 0.475);
        assert_eq!(frame.active_phoneme, "AA");
        assert_eq!(frame.blend_phoneme.as_deref(), Some("B"));
        assert!((frame.blend_weight - 0.5).abs() < LOOSE_EPS);
        // AA opens to 0.8 and B presses to 0.8; halfway gives 0.4 each.
        assert!((frame.mouth_open - 0.4).abs() < LOOSE_EPS);
        assert!((frame.lip_press - 0.4).abs() < LOOSE_EPS);
        assert!((frame.lip_corner_pull - 0.05).abs() < LOOSE_EPS);
    }

    #[test]
    fn test_evaluate_lip_sync_no_blend_outside_lookahead() {
        // At t = 0.4 the next event is 0.1 away, beyond the 0.05 lookahead.
        let track = make_aa_then_b_track();
        let frame = evaluate_lip_sync(&track, 0.4);
        assert_eq!(frame.active_phoneme, "AA");
        assert!(frame.blend_phoneme.is_none());
        assert!((frame.blend_weight).abs() < EPS);
        assert!((frame.mouth_open - 0.8).abs() < EPS);
    }

    #[test]
    fn test_phoneme_to_mouth_shape_vowels() {
        let (o, _c, _p) = phoneme_to_mouth_shape("AA");
        assert!(o > 0.5, "AA should have large mouth open");
        let (o2, c2, _) = phoneme_to_mouth_shape("IY");
        assert!(o2 < 0.4, "IY should have smaller opening");
        assert!(c2 > 0.4, "IY should pull corners");
    }

    #[test]
    fn test_phoneme_to_mouth_shape_bilabial() {
        let (o, _c, p) = phoneme_to_mouth_shape("B");
        assert!((o).abs() < EPS, "B should close mouth");
        assert!(p > 0.5, "B should press lips");
    }

    #[test]
    fn test_phoneme_to_mouth_shape_silence() {
        let (o, c, p) = phoneme_to_mouth_shape("SIL");
        assert!((o + c + p).abs() < EPS);
    }

    #[test]
    fn test_phonemes_at_time() {
        let mut track = new_lip_sync_track(3.0);
        add_phoneme_event(&mut track, make_event("AA", 0.0, 0.5));
        add_phoneme_event(&mut track, make_event("B", 0.6, 0.3));
        let at_01 = phonemes_at_time(&track, 0.1);
        assert_eq!(at_01.len(), 1);
        assert_eq!(at_01[0].phoneme, "AA");
        // 0.55 falls in the gap between the two events.
        let at_055 = phonemes_at_time(&track, 0.55);
        assert!(at_055.is_empty());
    }

    #[test]
    fn test_sort_phoneme_events() {
        let mut track = new_lip_sync_track(3.0);
        add_phoneme_event(&mut track, make_event("B", 0.5, 0.2));
        add_phoneme_event(&mut track, make_event("AA", 0.0, 0.4));
        sort_phoneme_events(&mut track);
        assert!((track.events[0].start_time - 0.0).abs() < EPS);
        assert!((track.events[1].start_time - 0.5).abs() < EPS);
    }

    #[test]
    fn test_trim_lip_sync() {
        let mut track = new_lip_sync_track(5.0);
        add_phoneme_event(&mut track, make_event("AA", 0.0, 0.5));
        add_phoneme_event(&mut track, make_event("B", 1.0, 0.3));
        add_phoneme_event(&mut track, make_event("IY", 3.0, 0.5));
        trim_lip_sync(&mut track, 0.5, 2.0);
        // AA ends at 0.5 so its ev_end == 0.5 which is NOT > start=0.5, should be removed
        assert_eq!(event_count(&track), 1);
        assert_eq!(track.events[0].phoneme, "B");
        assert!((track.duration - 1.5).abs() < EPS);
    }

    #[test]
    fn test_scale_lip_sync_timing() {
        let mut track = new_lip_sync_track(2.0);
        add_phoneme_event(&mut track, make_event("AA", 0.0, 0.5));
        add_phoneme_event(&mut track, make_event("B", 0.5, 0.5));
        scale_lip_sync_timing(&mut track, 2.0);
        assert!((track.duration - 4.0).abs() < EPS);
        assert!((track.events[0].duration - 1.0).abs() < EPS);
        assert!((track.events[1].start_time - 1.0).abs() < EPS);
    }

    #[test]
    fn test_merge_lip_sync_tracks() {
        let mut a = new_lip_sync_track(1.0);
        add_phoneme_event(&mut a, make_event("AA", 0.0, 0.5));
        let mut b = new_lip_sync_track(2.0);
        add_phoneme_event(&mut b, make_event("B", 1.0, 0.5));
        add_phoneme_event(&mut b, make_event("IY", 1.5, 0.5));
        let merged = merge_lip_sync_tracks(&a, &b);
        assert_eq!(event_count(&merged), 3);
        assert!((merged.duration - 2.0).abs() < EPS);
    }

    #[test]
    fn test_viseme_weights_empty() {
        let track = new_lip_sync_track(1.0);
        let weights = lip_sync_to_viseme_weights(&track, 0.5);
        assert!(weights.is_empty());
    }

    #[test]
    fn test_default_coarticulation() {
        let p = default_coarticulation();
        assert!(p.lookahead > 0.0);
        assert!(p.lookbehind >= 0.0);
        assert!(p.smoothing >= 0.0 && p.smoothing <= 1.0);
    }

    #[test]
    fn test_viseme_weights_active() {
        let mut track = new_lip_sync_track(2.0);
        add_phoneme_event(&mut track, make_event("AA", 0.0, 1.0));
        // No next event, so weight=1.0 for "AA"
        let weights = lip_sync_to_viseme_weights(&track, 0.3);
        assert!(!weights.is_empty());
        assert_eq!(weights[0].0, "AA");
        assert!((weights[0].1 - 1.0).abs() < EPS);
    }

    #[test]
    fn test_viseme_weights_blend() {
        // Inside the lookahead window both phonemes are reported, active
        // first, and their weights add up to one.
        let track = make_aa_then_b_track();
        let weights = lip_sync_to_viseme_weights(&track, 0.475);
        assert_eq!(weights.len(), 2);
        assert_eq!(weights[0].0, "AA");
        assert_eq!(weights[1].0, "B");
        assert!((weights[0].1 + weights[1].1 - 1.0).abs() < LOOSE_EPS);
        assert!((weights[1].1 - 0.5).abs() < LOOSE_EPS);
    }
}