/// A single timed phoneme placed on a [`LipSyncTrack`].
///
/// The event is considered active over the half-open interval
/// `[start_time, start_time + duration)`.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct PhonemeEvent {
    /// Phoneme label (for example `"AA"` or `"B"`); the mouth-shape lookup
    /// upper-cases it before matching, so case does not matter there.
    pub phoneme: String,
    /// Time at which the event becomes active, in the track's time unit
    /// (presumably seconds; this file does not state the unit).
    pub start_time: f32,
    /// Length of the event, in the same unit as `start_time`.
    pub duration: f32,
    /// Multiplier applied to the phoneme's mouth-shape values on evaluation.
    pub intensity: f32,
}

/// Tuning values for blending between neighbouring phonemes.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct CoarticulationParams {
    /// How far ahead of an upcoming event's start the evaluator begins
    /// blending toward it.
    pub lookahead: f32,
    /// Stored on the track but not read by any function visible in this file.
    pub lookbehind: f32,
    /// Stored on the track but not read by any function visible in this file.
    pub smoothing: f32,
}

/// A timeline of phoneme events plus the blending parameters used to
/// evaluate it.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct LipSyncTrack {
    /// Events in insertion order; they are only sorted when
    /// `sort_phoneme_events` is called explicitly (or by a merge).
    pub events: Vec<PhonemeEvent>,
    /// Nominal length of the track, in the same unit as the event times.
    pub duration: f32,
    /// Blending parameters consulted when the track is evaluated.
    pub coarticulation: CoarticulationParams,
}
28
/// The evaluated mouth state of a lip-sync track at one instant.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct LipSyncFrame {
    /// The time the track was evaluated at.
    pub time: f32,
    /// Phoneme of the event active at `time`; empty when no event is active.
    pub active_phoneme: String,
    /// Phoneme of the upcoming event being blended toward, if one starts
    /// within the track's lookahead window.
    pub blend_phoneme: Option<String>,
    /// Blend factor toward `blend_phoneme`, clamped to `0.0..=1.0`; `0.0`
    /// when there is no blend target. Not scaled by the event intensity.
    pub blend_weight: f32,
    /// First mouth-shape component (jaw/mouth opening) after blending.
    pub mouth_open: f32,
    /// Second mouth-shape component (lip-corner pull) after blending.
    pub lip_corner_pull: f32,
    /// Third mouth-shape component (lip press) after blending.
    pub lip_press: f32,
}
39
40#[allow(dead_code)]
45pub fn default_coarticulation() -> CoarticulationParams {
46 CoarticulationParams {
47 lookahead: 0.05,
48 lookbehind: 0.03,
49 smoothing: 0.3,
50 }
51}
52
53#[allow(dead_code)]
54pub fn new_lip_sync_track(duration: f32) -> LipSyncTrack {
55 LipSyncTrack {
56 events: Vec::new(),
57 duration,
58 coarticulation: default_coarticulation(),
59 }
60}
61
62#[allow(dead_code)]
63pub fn add_phoneme_event(track: &mut LipSyncTrack, event: PhonemeEvent) {
64 track.events.push(event);
65}
66
67#[allow(dead_code)]
72pub fn sort_phoneme_events(track: &mut LipSyncTrack) {
73 track.events.sort_by(|a, b| {
74 a.start_time
75 .partial_cmp(&b.start_time)
76 .unwrap_or(std::cmp::Ordering::Equal)
77 });
78}
79
80#[allow(dead_code)]
81pub fn event_count(track: &LipSyncTrack) -> usize {
82 track.events.len()
83}
84
85#[allow(dead_code)]
86pub fn phonemes_at_time(track: &LipSyncTrack, time: f32) -> Vec<&PhonemeEvent> {
87 track
88 .events
89 .iter()
90 .filter(|e| time >= e.start_time && time < e.start_time + e.duration)
91 .collect()
92}
93
/// Maps a phoneme label to its `(mouth_open, lip_corner_pull, lip_press)`
/// shape triple.
///
/// The label is upper-cased before lookup, so matching is case-insensitive.
/// `"SIL"` and the empty string map to a fully neutral mouth; any label that
/// is not in the table falls back to `(0.3, 0.1, 0.1)`.
#[allow(dead_code)]
pub fn phoneme_to_mouth_shape(phoneme: &str) -> (f32, f32, f32) {
    type Shape = (f32, f32, f32);

    // Shape used for labels that are not listed in the table.
    const FALLBACK: Shape = (0.3, 0.1, 0.1);

    // Each row lists the labels that share one mouth shape.
    const TABLE: &[(&[&str], Shape)] = &[
        (&["AA", "AH"], (0.8, 0.1, 0.0)),
        (&["AE"], (0.7, 0.3, 0.0)),
        (&["AO"], (0.6, 0.0, 0.0)),
        (&["AW"], (0.5, 0.0, 0.1)),
        (&["AY"], (0.7, 0.2, 0.0)),
        (&["EH"], (0.5, 0.4, 0.0)),
        (&["ER"], (0.4, 0.1, 0.1)),
        (&["EY"], (0.4, 0.5, 0.0)),
        (&["IH", "IY"], (0.2, 0.6, 0.0)),
        (&["OW"], (0.5, 0.0, 0.2)),
        (&["OY"], (0.5, 0.0, 0.3)),
        (&["UH", "UW"], (0.3, 0.0, 0.4)),
        (&["B", "P", "M"], (0.0, 0.0, 0.8)),
        (&["F", "V"], (0.1, 0.0, 0.6)),
        (&["TH", "DH"], (0.2, 0.0, 0.2)),
        (&["S", "Z"], (0.1, 0.3, 0.3)),
        (&["SH", "ZH"], (0.2, 0.1, 0.4)),
        (&["SIL", ""], (0.0, 0.0, 0.0)),
    ];

    let key = phoneme.to_uppercase();
    TABLE
        .iter()
        .find(|(labels, _)| labels.iter().any(|&label| label == key.as_str()))
        .map_or(FALLBACK, |&(_, shape)| shape)
}
128
129#[allow(dead_code)]
134pub fn evaluate_lip_sync(track: &LipSyncTrack, time: f32) -> LipSyncFrame {
135 let active = track
137 .events
138 .iter()
139 .rfind(|e| time >= e.start_time && time < e.start_time + e.duration);
140
141 let next = track
143 .events
144 .iter()
145 .filter(|e| e.start_time > time && e.start_time - time <= track.coarticulation.lookahead)
146 .min_by(|a, b| {
147 a.start_time
148 .partial_cmp(&b.start_time)
149 .unwrap_or(std::cmp::Ordering::Equal)
150 });
151
152 let (active_phoneme, base_open, base_corner, base_press) = if let Some(ev) = active {
153 let (o, c, p) = phoneme_to_mouth_shape(&ev.phoneme);
154 (
155 ev.phoneme.clone(),
156 o * ev.intensity,
157 c * ev.intensity,
158 p * ev.intensity,
159 )
160 } else {
161 (String::new(), 0.0, 0.0, 0.0)
162 };
163
164 let (blend_phoneme, blend_weight, mouth_open, lip_corner_pull, lip_press) =
165 if let Some(nev) = next {
166 let dist = nev.start_time - time;
167 let weight = (1.0 - dist / track.coarticulation.lookahead).clamp(0.0, 1.0);
168 let (no, nc, np) = phoneme_to_mouth_shape(&nev.phoneme);
169 let weight_scaled = weight * nev.intensity;
170 let w_inv = 1.0 - weight_scaled;
171 (
172 Some(nev.phoneme.clone()),
173 weight,
174 base_open * w_inv + no * weight_scaled,
175 base_corner * w_inv + nc * weight_scaled,
176 base_press * w_inv + np * weight_scaled,
177 )
178 } else {
179 (None, 0.0, base_open, base_corner, base_press)
180 };
181
182 LipSyncFrame {
183 time,
184 active_phoneme,
185 blend_phoneme,
186 blend_weight,
187 mouth_open,
188 lip_corner_pull,
189 lip_press,
190 }
191}
192
193#[allow(dead_code)]
198pub fn lip_sync_to_viseme_weights(track: &LipSyncTrack, time: f32) -> Vec<(String, f32)> {
199 let frame = evaluate_lip_sync(track, time);
200 let mut weights: Vec<(String, f32)> = Vec::new();
201 if !frame.active_phoneme.is_empty() {
202 let w = 1.0 - frame.blend_weight;
203 if w > 0.001 {
204 weights.push((frame.active_phoneme.clone(), w));
205 }
206 }
207 if let Some(blend) = frame.blend_phoneme {
208 if frame.blend_weight > 0.001 {
209 weights.push((blend, frame.blend_weight));
210 }
211 }
212 weights
213}
214
215#[allow(dead_code)]
220pub fn trim_lip_sync(track: &mut LipSyncTrack, start: f32, end: f32) {
221 track.events.retain(|e| {
222 let ev_end = e.start_time + e.duration;
223 ev_end > start && e.start_time < end
224 });
225 track.duration = end - start;
226}
227
228#[allow(dead_code)]
229pub fn scale_lip_sync_timing(track: &mut LipSyncTrack, factor: f32) {
230 for event in track.events.iter_mut() {
231 event.start_time *= factor;
232 event.duration *= factor;
233 }
234 track.duration *= factor;
235}
236
237#[allow(dead_code)]
238pub fn merge_lip_sync_tracks(a: &LipSyncTrack, b: &LipSyncTrack) -> LipSyncTrack {
239 let mut merged = new_lip_sync_track(a.duration.max(b.duration));
240 for ev in &a.events {
241 merged.events.push(ev.clone());
242 }
243 for ev in &b.events {
244 merged.events.push(ev.clone());
245 }
246 sort_phoneme_events(&mut merged);
247 merged
248}
249
#[cfg(test)]
mod tests {
    //! Unit tests for the lip-sync track helpers.

    use super::*;

    /// Absolute tolerance shared by the floating-point assertions below.
    const EPS: f32 = 1e-6;

    /// Returns `true` when `actual` is within `EPS` of `expected`.
    fn close(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < EPS
    }

    /// Builds a full-intensity event.
    fn make_event(phoneme: &str, start: f32, dur: f32) -> PhonemeEvent {
        PhonemeEvent {
            phoneme: phoneme.to_string(),
            start_time: start,
            duration: dur,
            intensity: 1.0,
        }
    }

    /// Builds a track of the given duration holding `events`
    /// (`(phoneme, start, duration)`) in the order given.
    fn track_with(duration: f32, events: &[(&str, f32, f32)]) -> LipSyncTrack {
        let mut track = new_lip_sync_track(duration);
        for &(phoneme, start, dur) in events {
            add_phoneme_event(&mut track, make_event(phoneme, start, dur));
        }
        track
    }

    /// "AA" over `[0, 1)` followed by "B" at 1.0, with a lookahead of 0.5 so
    /// that every value in the blend tests is exactly representable.
    fn blend_track() -> LipSyncTrack {
        let mut track = track_with(2.0, &[("AA", 0.0, 1.0), ("B", 1.0, 0.5)]);
        track.coarticulation.lookahead = 0.5;
        track
    }

    #[test]
    fn test_new_track() {
        let track = new_lip_sync_track(5.0);
        assert!(close(track.duration, 5.0));
        assert!(track.events.is_empty());
    }

    #[test]
    fn test_add_event() {
        let mut track = new_lip_sync_track(3.0);
        add_phoneme_event(&mut track, make_event("AA", 0.0, 0.2));
        assert_eq!(track.events.len(), 1);
    }

    #[test]
    fn test_event_count() {
        let mut track = new_lip_sync_track(3.0);
        assert_eq!(event_count(&track), 0);
        add_phoneme_event(&mut track, make_event("AA", 0.0, 0.2));
        add_phoneme_event(&mut track, make_event("B", 0.2, 0.1));
        assert_eq!(event_count(&track), 2);
    }

    #[test]
    fn test_evaluate_lip_sync_active() {
        let track = track_with(2.0, &[("AA", 0.0, 0.5)]);
        let frame = evaluate_lip_sync(&track, 0.2);
        assert_eq!(frame.active_phoneme, "AA");
        assert!(frame.mouth_open > 0.0);
    }

    #[test]
    fn test_evaluate_lip_sync_silence() {
        let track = new_lip_sync_track(2.0);
        let frame = evaluate_lip_sync(&track, 0.5);
        assert_eq!(frame.active_phoneme, "");
        assert!(close(frame.mouth_open, 0.0));
    }

    #[test]
    fn test_evaluate_lip_sync_blends_toward_next() {
        // Halfway through the lookahead window the blend weight is 0.5, so
        // the frame is the midpoint between the "AA" and "B" shapes.
        let frame = evaluate_lip_sync(&blend_track(), 0.75);
        assert_eq!(frame.active_phoneme, "AA");
        assert_eq!(frame.blend_phoneme.as_deref(), Some("B"));
        assert!(close(frame.blend_weight, 0.5));
        assert!(close(frame.mouth_open, 0.4));
        assert!(close(frame.lip_corner_pull, 0.05));
        assert!(close(frame.lip_press, 0.4));
    }

    #[test]
    fn test_evaluate_lip_sync_no_blend_outside_lookahead() {
        let frame = evaluate_lip_sync(&blend_track(), 0.25);
        assert_eq!(frame.active_phoneme, "AA");
        assert!(frame.blend_phoneme.is_none());
        assert!(close(frame.blend_weight, 0.0));
        assert!(close(frame.mouth_open, 0.8));
    }

    #[test]
    fn test_phoneme_to_mouth_shape_vowels() {
        let (o, _c, _p) = phoneme_to_mouth_shape("AA");
        assert!(o > 0.5, "AA should have large mouth open");
        let (o2, c2, _) = phoneme_to_mouth_shape("IY");
        assert!(o2 < 0.4, "IY should have smaller opening");
        assert!(c2 > 0.4, "IY should pull corners");
    }

    #[test]
    fn test_phoneme_to_mouth_shape_bilabial() {
        let (o, _c, p) = phoneme_to_mouth_shape("B");
        assert!(close(o, 0.0), "B should close mouth");
        assert!(p > 0.5, "B should press lips");
    }

    #[test]
    fn test_phoneme_to_mouth_shape_silence() {
        let (o, c, p) = phoneme_to_mouth_shape("SIL");
        assert!(close(o + c + p, 0.0));
    }

    #[test]
    fn test_phonemes_at_time() {
        let track = track_with(3.0, &[("AA", 0.0, 0.5), ("B", 0.6, 0.3)]);
        let at_01 = phonemes_at_time(&track, 0.1);
        assert_eq!(at_01.len(), 1);
        assert_eq!(at_01[0].phoneme, "AA");
        let at_05 = phonemes_at_time(&track, 0.55);
        assert!(at_05.is_empty());
    }

    #[test]
    fn test_phonemes_at_time_half_open_interval() {
        let track = track_with(3.0, &[("AA", 0.0, 0.5)]);
        // The start of the interval is included ...
        assert_eq!(phonemes_at_time(&track, 0.0).len(), 1);
        // ... and the end is excluded.
        assert!(phonemes_at_time(&track, 0.5).is_empty());
    }

    #[test]
    fn test_sort_phoneme_events() {
        let mut track = track_with(3.0, &[("B", 0.5, 0.2), ("AA", 0.0, 0.4)]);
        sort_phoneme_events(&mut track);
        assert!(close(track.events[0].start_time, 0.0));
        assert!(close(track.events[1].start_time, 0.5));
    }

    #[test]
    fn test_trim_lip_sync() {
        let mut track = track_with(5.0, &[("AA", 0.0, 0.5), ("B", 1.0, 0.3), ("IY", 3.0, 0.5)]);
        trim_lip_sync(&mut track, 0.5, 2.0);
        assert_eq!(event_count(&track), 1);
        assert_eq!(track.events[0].phoneme, "B");
    }

    #[test]
    fn test_scale_lip_sync_timing() {
        let mut track = track_with(2.0, &[("AA", 0.0, 0.5), ("B", 0.5, 0.5)]);
        scale_lip_sync_timing(&mut track, 2.0);
        assert!(close(track.duration, 4.0));
        assert!(close(track.events[0].duration, 1.0));
        assert!(close(track.events[1].start_time, 1.0));
    }

    #[test]
    fn test_merge_lip_sync_tracks() {
        let a = track_with(1.0, &[("AA", 0.0, 0.5)]);
        let b = track_with(2.0, &[("B", 1.0, 0.5), ("IY", 1.5, 0.5)]);
        let merged = merge_lip_sync_tracks(&a, &b);
        assert_eq!(event_count(&merged), 3);
        assert!(close(merged.duration, 2.0));
    }

    #[test]
    fn test_merge_lip_sync_tracks_sorts_events() {
        let a = track_with(2.0, &[("AA", 1.0, 0.5)]);
        let b = track_with(2.0, &[("B", 0.5, 0.25)]);
        let merged = merge_lip_sync_tracks(&a, &b);
        assert_eq!(merged.events[0].phoneme, "B");
        assert_eq!(merged.events[1].phoneme, "AA");
    }

    #[test]
    fn test_viseme_weights_empty() {
        let track = new_lip_sync_track(1.0);
        let weights = lip_sync_to_viseme_weights(&track, 0.5);
        assert!(weights.is_empty());
    }

    #[test]
    fn test_default_coarticulation() {
        let p = default_coarticulation();
        assert!(p.lookahead > 0.0);
        assert!(p.lookbehind >= 0.0);
        assert!((0.0..=1.0).contains(&p.smoothing));
    }

    #[test]
    fn test_viseme_weights_active() {
        let track = track_with(2.0, &[("AA", 0.0, 1.0)]);
        let weights = lip_sync_to_viseme_weights(&track, 0.3);
        assert!(!weights.is_empty());
        assert_eq!(weights[0].0, "AA");
    }

    #[test]
    fn test_viseme_weights_blend() {
        let weights = lip_sync_to_viseme_weights(&blend_track(), 0.75);
        assert_eq!(weights.len(), 2);
        assert_eq!(weights[0].0, "AA");
        assert!(close(weights[0].1, 0.5));
        assert_eq!(weights[1].0, "B");
        assert!(close(weights[1].1, 0.5));
    }
}