use std::collections::HashMap;
/// One phoneme occupying a span of the timeline.
///
/// `start` and `end` are compared directly against the sample time handed to
/// `active_phonemes_at`, so all three must share one time unit (presumably
/// seconds, since `bake_phoneme_sequence` derives sample times as
/// `frame_index / fps` — confirm against the caller).
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct PhonemeEvent {
    /// Phoneme label; used as the lookup key into the viseme map.
    pub phoneme: String,
    /// Time at which the phoneme reaches full weight.
    pub start: f32,
    /// Time after which the phoneme starts fading out.
    pub end: f32,
}
/// Tunable parameters consumed by `bake_phoneme_sequence`.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct BakerConfig {
    /// Sampling rate: frame `i` is evaluated at time `i / fps`.
    /// Presumably expected to be positive and finite — the baker does not
    /// validate it.
    pub fps: f32,
    /// Width of the linear fade applied before an event's `start` and after
    /// its `end`, in the same time unit as the events.
    pub blend_window: f32,
    /// Multiplier applied to every morph weight of every frame; the product
    /// is clamped to `[0.0, 1.0]`. Values within `f32::EPSILON` of `1.0`
    /// skip the scaling step entirely.
    pub emphasis_scale: f32,
    /// Viseme-map key whose pose is used for frames with no active phoneme.
    pub silence_phoneme: String,
}
impl Default for BakerConfig {
fn default() -> Self {
Self {
fps: 30.0,
blend_window: 0.05,
emphasis_scale: 1.0,
silence_phoneme: "SIL".to_string(),
}
}
}
/// Result of baking a phoneme sequence into per-frame morph weights.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct BakedLipSync {
    /// Sampling rate the frames were generated at (copied from the config).
    pub fps: f32,
    /// One map of morph-target name to weight per sampled frame, in time order.
    pub frames: Vec<HashMap<String, f32>>,
    /// Largest `end` among the baked events, or `0.0` when there were none.
    pub duration: f32,
}
/// Builds the built-in phoneme → viseme pose table.
///
/// Every phoneme maps to a pose holding the same five morph targets
/// (`mouth_open`, `lip_round`, `lip_wide`, `teeth_show`, `jaw_drop`).
/// Phonemes that share a mouth shape are listed together and receive
/// identical poses.
#[allow(dead_code)]
pub fn build_default_viseme_map() -> HashMap<String, HashMap<String, f32>> {
    // Column order for the value rows in `POSES`.
    const MORPHS: [&str; 5] = [
        "mouth_open",
        "lip_round",
        "lip_wide",
        "teeth_show",
        "jaw_drop",
    ];

    // (phonemes sharing the pose, [mouth_open, lip_round, lip_wide, teeth_show, jaw_drop])
    const POSES: &[(&[&str], [f32; 5])] = &[
        (&["SIL"], [0.0, 0.0, 0.0, 0.0, 0.0]),
        (&["AA"], [0.9, 0.0, 0.4, 0.6, 0.8]),
        (&["AE"], [0.6, 0.0, 0.5, 0.4, 0.5]),
        (&["IY"], [0.2, 0.0, 0.9, 0.5, 0.1]),
        (&["UW"], [0.3, 0.9, 0.0, 0.0, 0.2]),
        (&["OW"], [0.5, 0.7, 0.0, 0.1, 0.4]),
        (&["B", "P", "M"], [0.0, 0.0, 0.0, 0.0, 0.0]),
        (&["F", "V"], [0.1, 0.0, 0.3, 0.8, 0.1]),
        (&["TH", "DH"], [0.15, 0.0, 0.2, 0.7, 0.1]),
        (&["S", "Z"], [0.05, 0.0, 0.4, 0.6, 0.05]),
        (&["CH", "JH", "SH", "ZH"], [0.2, 0.4, 0.1, 0.3, 0.15]),
        (&["R"], [0.2, 0.3, 0.0, 0.1, 0.2]),
        (&["L", "N", "D", "T"], [0.3, 0.0, 0.3, 0.3, 0.2]),
    ];

    let mut table: HashMap<String, HashMap<String, f32>> = HashMap::new();
    for &(phonemes, values) in POSES.iter() {
        for &phoneme in phonemes.iter() {
            let pose: HashMap<String, f32> = MORPHS
                .iter()
                .zip(values.iter())
                .map(|(&morph, &value)| (morph.to_string(), value))
                .collect();
            table.insert(phoneme.to_string(), pose);
        }
    }
    table
}
/// Returns the phonemes influencing time `t`, each paired with a blend weight.
///
/// An event has weight `1.0` while `start <= t <= end`. Outside that span the
/// weight falls off linearly over `blend_window`, and events further away
/// than `blend_window` (or whose weight is not strictly positive) are left
/// out. Results keep the order of `events`.
///
/// If the weights add up to more than `1.0` they are rescaled to sum to
/// `1.0`; smaller totals are returned untouched.
#[allow(dead_code)]
pub fn active_phonemes_at(
    events: &[PhonemeEvent],
    t: f32,
    blend_window: f32,
) -> Vec<(String, f32)> {
    let mut active: Vec<(String, f32)> = events
        .iter()
        .filter_map(|event| {
            let outside = t < event.start - blend_window || t > event.end + blend_window;
            if outside {
                return None;
            }

            // Distance from `t` to the nearest edge of the event; `None`
            // means `t` sits inside the event itself.
            let gap = if t < event.start {
                Some(event.start - t)
            } else if t > event.end {
                Some(t - event.end)
            } else {
                None
            };
            let weight = match gap {
                Some(distance) => 1.0 - (distance / blend_window).clamp(0.0, 1.0),
                None => 1.0,
            };

            if weight > 0.0 {
                Some((event.phoneme.clone(), weight))
            } else {
                None
            }
        })
        .collect();

    let total: f32 = active.iter().map(|&(_, weight)| weight).sum();
    if total > 1.0 {
        // Overlapping events: rescale so the combined influence is exactly 1.
        active.iter_mut().for_each(|entry| entry.1 /= total);
    }
    active
}
/// Mixes the viseme poses of the contributing phonemes into one pose.
///
/// For every `(phoneme, weight)` pair whose phoneme exists in `viseme_map`,
/// each morph value of that pose is multiplied by `weight` and accumulated
/// under the morph's name. Phonemes missing from the map contribute nothing,
/// so the result may be empty.
#[allow(dead_code)]
pub fn blend_viseme_weights(
    contributions: &[(String, f32)],
    viseme_map: &HashMap<String, HashMap<String, f32>>,
) -> HashMap<String, f32> {
    let mut blended: HashMap<String, f32> = HashMap::new();

    // Pair each known phoneme's pose with its weight, dropping unknown ones.
    let known_poses = contributions
        .iter()
        .filter_map(|(phoneme, weight)| viseme_map.get(phoneme).map(|pose| (pose, *weight)));

    for (pose, weight) in known_poses {
        for (morph_name, morph_value) in pose.iter() {
            let slot = blended.entry(morph_name.clone()).or_insert(0.0);
            *slot += *morph_value * weight;
        }
    }
    blended
}
/// Samples a phoneme timeline at `cfg.fps` and bakes one morph-weight map per frame.
///
/// * The duration is the largest `end` among `events` (never below `0.0`).
/// * `ceil(duration * fps) + 1` frames are produced; frame `i` is evaluated
///   at time `i / fps`.
/// * A frame with no active phoneme gets a copy of the pose registered under
///   `cfg.silence_phoneme` (an empty map if that key is missing); otherwise
///   the active phonemes' poses are blended.
/// * Unless `cfg.emphasis_scale` is within `f32::EPSILON` of `1.0`, every
///   weight is multiplied by it and clamped to `[0.0, 1.0]`.
///
/// NOTE(review): `cfg.fps` is used as a divisor without validation, so it is
/// presumably expected to be positive and finite — confirm with callers.
#[allow(dead_code)]
pub fn bake_phoneme_sequence(
    events: &[PhonemeEvent],
    viseme_map: &HashMap<String, HashMap<String, f32>>,
    cfg: &BakerConfig,
) -> BakedLipSync {
    let mut duration = 0.0_f32;
    for event in events {
        duration = duration.max(event.end);
    }
    // +1 so that both the first (t = 0) and the last sample are included.
    let frame_count = (duration * cfg.fps).ceil() as usize + 1;

    let silence_map: HashMap<String, f32> = match viseme_map.get(&cfg.silence_phoneme) {
        Some(pose) => pose.clone(),
        None => HashMap::new(),
    };
    // Loop-invariant: a scale of (effectively) 1.0 leaves weights untouched.
    let rescale = (cfg.emphasis_scale - 1.0).abs() > f32::EPSILON;

    let mut frames: Vec<HashMap<String, f32>> = Vec::with_capacity(frame_count);
    for index in 0..frame_count {
        let t = (index as f32) / cfg.fps;
        let contributions = active_phonemes_at(events, t, cfg.blend_window);

        let mut weights = if contributions.is_empty() {
            silence_map.clone()
        } else {
            blend_viseme_weights(&contributions, viseme_map)
        };

        if rescale {
            for value in weights.values_mut() {
                *value = (*value * cfg.emphasis_scale).clamp(0.0, 1.0);
            }
        }
        frames.push(weights);
    }

    BakedLipSync {
        fps: cfg.fps,
        frames,
        duration,
    }
}
/// Unit tests for the lip-sync baker.
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand constructor for a `PhonemeEvent`.
    fn ev(phoneme: &str, start: f32, end: f32) -> PhonemeEvent {
        PhonemeEvent {
            phoneme: phoneme.to_string(),
            start,
            end,
        }
    }

    /// A sample inside an event yields that phoneme at full weight.
    #[test]
    fn test_active_in_middle_full_weight() {
        let events = vec![ev("AA", 0.0, 1.0)];
        let result = active_phonemes_at(&events, 0.5, 0.05);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "AA");
        assert!((result[0].1 - 1.0).abs() < 1e-5);
    }

    /// Within the blend window before the next event, both phonemes contribute.
    #[test]
    fn test_active_in_crossfade_both_present() {
        let events = vec![ev("AA", 0.0, 1.0), ev("IY", 1.0, 2.0)];
        let result = active_phonemes_at(&events, 0.97, 0.05);
        let has_iy = result.iter().any(|(p, _)| p == "IY");
        let has_aa = result.iter().any(|(p, _)| p == "AA");
        assert!(has_aa, "AA should be active at t=0.97");
        assert!(has_iy, "IY should be in fade-in at t=0.97");
    }

    /// Samples earlier than `start - blend_window` see nothing.
    #[test]
    fn test_active_before_first_event_empty() {
        let events = vec![ev("AA", 1.0, 2.0)];
        let result = active_phonemes_at(&events, 0.0, 0.05);
        assert!(result.is_empty());
    }

    /// Samples later than `end + blend_window` see nothing.
    #[test]
    fn test_active_after_last_event_empty() {
        let events = vec![ev("AA", 0.0, 1.0)];
        let result = active_phonemes_at(&events, 1.2, 0.05);
        assert!(result.is_empty());
    }

    /// A single contribution at weight 1.0 reproduces the phoneme's pose.
    #[test]
    fn test_blend_viseme_weights_sum() {
        let vm = build_default_viseme_map();
        let contributions = vec![("AA".to_string(), 1.0_f32)];
        let weights = blend_viseme_weights(&contributions, &vm);
        let aa = vm.get("AA").expect("should succeed");
        for (k, &v) in aa {
            assert!((weights[k] - v).abs() < 1e-5, "key {} mismatch", k);
        }
    }

    /// Frame count is `ceil(duration * fps) + 1`.
    #[test]
    fn test_bake_frame_count() {
        let events = vec![ev("AA", 0.0, 1.0)];
        let vm = build_default_viseme_map();
        let cfg = BakerConfig::default();
        let baked = bake_phoneme_sequence(&events, &vm, &cfg);
        let expected = (1.0_f32 * 30.0).ceil() as usize + 1;
        assert_eq!(baked.frames.len(), expected);
    }

    /// Baked frames carry the morph-target keys of the viseme map.
    #[test]
    fn test_baked_frame_has_morph_keys() {
        let events = vec![ev("AA", 0.0, 1.0)];
        let vm = build_default_viseme_map();
        let cfg = BakerConfig::default();
        let baked = bake_phoneme_sequence(&events, &vm, &cfg);
        let frame = &baked.frames[0];
        assert!(frame.contains_key("mouth_open"));
        assert!(frame.contains_key("lip_round"));
    }

    /// The default map always provides the silence pose.
    #[test]
    fn test_default_viseme_map_contains_sil() {
        let vm = build_default_viseme_map();
        assert!(vm.contains_key("SIL"));
    }

    /// Pins the documented default configuration values.
    #[test]
    fn test_baker_config_defaults() {
        let cfg = BakerConfig::default();
        assert!((cfg.fps - 30.0).abs() < 1e-5);
        assert!((cfg.blend_window - 0.05).abs() < 1e-5);
        assert!((cfg.emphasis_scale - 1.0).abs() < 1e-5);
        assert_eq!(cfg.silence_phoneme, "SIL");
    }

    /// `emphasis_scale` multiplies every baked weight.
    #[test]
    fn test_emphasis_scale_applies() {
        let events = vec![ev("AA", 0.0, 1.0)];
        let vm = build_default_viseme_map();
        let cfg_normal = BakerConfig::default();
        let cfg_half = BakerConfig {
            emphasis_scale: 0.5,
            ..Default::default()
        };
        let baked_normal = bake_phoneme_sequence(&events, &vm, &cfg_normal);
        let baked_half = bake_phoneme_sequence(&events, &vm, &cfg_half);

        // Frame 5 is sampled at t = 5/30, well inside the AA event, so the
        // unscaled value is AA's full `mouth_open` weight (0.9).
        let frame_idx = 5;
        let mouth_open_normal = baked_normal.frames[frame_idx]
            .get("mouth_open")
            .copied()
            .unwrap_or(0.0);
        let mouth_open_half = baked_half.frames[frame_idx]
            .get("mouth_open")
            .copied()
            .unwrap_or(0.0);

        // Asserted unconditionally: a guard on the baseline value would let
        // the test pass without checking anything if the baseline regressed.
        assert!(
            (mouth_open_normal - 0.9).abs() < 1e-5,
            "unscaled mouth_open should be AA's 0.9, got {}",
            mouth_open_normal
        );
        assert!(
            (mouth_open_half - 0.45).abs() < 1e-5,
            "half scale should give 0.45, got {}",
            mouth_open_half
        );
        assert!(
            mouth_open_half < mouth_open_normal,
            "half scale should be smaller"
        );
    }

    /// Two contributions are mixed proportionally to their weights.
    #[test]
    fn test_blend_viseme_weights_two_contributions() {
        let vm = build_default_viseme_map();
        let contributions = vec![("SIL".to_string(), 0.5_f32), ("AA".to_string(), 0.5_f32)];
        let weights = blend_viseme_weights(&contributions, &vm);
        let expected = 0.9 * 0.5;
        assert!((weights["mouth_open"] - expected).abs() < 1e-5);
    }

    /// The baked result reports the fps it was sampled at.
    #[test]
    fn test_baked_fps_matches_config() {
        let events = vec![ev("IY", 0.0, 0.5)];
        let vm = build_default_viseme_map();
        let cfg = BakerConfig {
            fps: 24.0,
            ..Default::default()
        };
        let baked = bake_phoneme_sequence(&events, &vm, &cfg);
        assert!((baked.fps - 24.0).abs() < 1e-5);
    }
}