use alloc::string::ToString;
use alloc::vec::Vec;
use serde::{Deserialize, Serialize};
use tracing::trace;
use crate::error::Result;
use crate::phoneme::{self, Phoneme};
use crate::prosody::Stress;
use crate::voice::VoiceProfile;
use crate::phoneme::PhonemeClass;
/// Smallest fraction of the shorter adjacent segment used for a crossfade
/// (reached when average coarticulation resistance is 1.0).
const MIN_CROSSFADE_FRACTION: f32 = 0.15;
/// Largest crossfade fraction (reached when coarticulation resistance is 0.0).
const MAX_CROSSFADE_FRACTION: f32 = 0.45;
/// Duration multiplier applied to consonants that sit inside a cluster
/// (a run of two or more consecutive consonants).
const CLUSTER_COMPRESSION: f32 = 0.7;
/// One phoneme occurrence in a sequence: what to say, for how long, and with
/// what stress and (optional) lexical tone.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PhonemeEvent {
    /// The phoneme to synthesize.
    pub phoneme: Phoneme,
    /// Nominal duration in seconds; rendering scales it by stress and
    /// cluster compression.
    pub duration: f32,
    /// Stress level; scales both duration and F0 at render time.
    pub stress: Stress,
    /// Optional lexical tone contour; `None` means no tone modulation.
    /// Defaults to `None` when absent from serialized data.
    #[serde(default)]
    pub tone: Option<crate::prosody::Tone>,
}
impl PhonemeEvent {
    /// Builds an event with no lexical tone.
    #[must_use]
    pub fn new(phoneme: Phoneme, duration: f32, stress: Stress) -> Self {
        Self {
            phoneme,
            duration,
            stress,
            tone: None,
        }
    }

    /// Builds an event carrying a lexical tone; otherwise identical to [`Self::new`].
    #[must_use]
    pub fn with_tone(
        phoneme: Phoneme,
        duration: f32,
        stress: Stress,
        tone: crate::prosody::Tone,
    ) -> Self {
        // Reuse `new` and override only the tone field.
        Self {
            tone: Some(tone),
            ..Self::new(phoneme, duration, stress)
        }
    }
}
/// An ordered list of [`PhonemeEvent`]s together with the parameters that
/// control how they are rendered to audio.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PhonemeSequence {
    /// Events to render, in order.
    events: Vec<PhonemeEvent>,
    /// Minimum crossfade duration between adjacent segments, in seconds.
    transition_window: f32,
    /// Anticipatory-onset fraction in [0, 1].
    /// NOTE(review): stored and serialized but not read anywhere in this
    /// file — presumably consumed elsewhere; confirm before removing.
    lookahead_onset: f32,
    /// Global tempo multiplier (1.0 = nominal); applied only by
    /// `render_planned`. Defaults via `default_speaking_rate` on deserialization.
    #[serde(default = "default_speaking_rate")]
    speaking_rate: f32,
}
/// Serde default for `PhonemeSequence::speaking_rate`: nominal tempo.
fn default_speaking_rate() -> f32 {
    1.0_f32
}
impl PhonemeSequence {
    /// Creates an empty sequence with default rendering parameters.
    #[must_use]
    pub fn new() -> Self {
        Self {
            events: Vec::new(),
            transition_window: 0.05, // 50 ms minimum crossfade
            lookahead_onset: 0.6,
            speaking_rate: 1.0,
        }
    }

    /// Sets the minimum crossfade window in seconds, floored at 1 ms.
    pub fn set_transition_window(&mut self, window: f32) {
        self.transition_window = window.max(0.001);
    }

    /// Sets the anticipatory-onset fraction, clamped to [0, 1].
    pub fn set_lookahead_onset(&mut self, onset: f32) {
        self.lookahead_onset = onset.clamp(0.0, 1.0);
    }

    /// Sets the tempo multiplier, clamped to [0.5, 3.0].
    pub fn set_speaking_rate(&mut self, rate: f32) {
        self.speaking_rate = rate.clamp(0.5, 3.0);
    }

    /// Current tempo multiplier.
    #[must_use]
    pub fn speaking_rate(&self) -> f32 {
        self.speaking_rate
    }

    /// Current minimum crossfade window in seconds.
    #[must_use]
    pub fn transition_window(&self) -> f32 {
        self.transition_window
    }

    /// Appends an event to the end of the sequence.
    pub fn push(&mut self, event: PhonemeEvent) {
        self.events.push(event);
    }

    /// Number of events in the sequence.
    #[must_use]
    pub fn len(&self) -> usize {
        self.events.len()
    }

    /// Whether the sequence has no events.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.events.is_empty()
    }

    /// All events, in order.
    #[must_use]
    pub fn events(&self) -> &[PhonemeEvent] {
        &self.events
    }

    /// Sum of nominal event durations in seconds (before stress/cluster scaling).
    #[must_use]
    pub fn total_duration(&self) -> f32 {
        self.events.iter().map(|e| e.duration).sum()
    }

    /// Renders the sequence by synthesizing each phoneme as an independent
    /// segment and crossfading adjacent segments together.
    ///
    /// Durations are scaled by stress (primary 1.15x, secondary 1.05x,
    /// unstressed 0.9x) and cluster compression; stressed phonemes also get a
    /// raised F0. The crossfade between two segments shrinks as their average
    /// coarticulation resistance grows, but never drops below
    /// `transition_window` seconds. Note `speaking_rate` is not applied here
    /// (only in `render_planned`).
    ///
    /// # Errors
    /// Propagates any error from `phoneme::synthesize_phoneme_nasalized`.
    pub fn render(&self, voice: &VoiceProfile, sample_rate: f32) -> Result<Vec<f32>> {
        if self.events.is_empty() {
            return Ok(Vec::new());
        }
        trace!(
            num_events = self.events.len(),
            sample_rate, "rendering phoneme sequence"
        );
        let in_cluster = detect_consonant_clusters(&self.events);
        // Per-event duration after stress scaling and cluster compression.
        let durations: Vec<f32> = self
            .events
            .iter()
            .enumerate()
            .map(|(i, e)| {
                let stress_scale = match e.stress {
                    Stress::Primary => 1.15,
                    Stress::Secondary => 1.05,
                    Stress::Unstressed => 0.9,
                };
                let cluster_scale = if in_cluster[i] {
                    CLUSTER_COMPRESSION
                } else {
                    1.0
                };
                e.duration * stress_scale * cluster_scale
            })
            .collect();
        let phoneme_list: Vec<Phoneme> = self.events.iter().map(|e| e.phoneme).collect();
        let nasalizations = phoneme::detect_nasalization(&phoneme_list);
        // Synthesize each phoneme into its own sample buffer.
        let mut segments: Vec<Vec<f32>> = Vec::with_capacity(self.events.len());
        for (i, (event, &dur)) in self.events.iter().zip(durations.iter()).enumerate() {
            // Stress raises pitch on a per-event copy of the voice profile.
            let mut event_voice = voice.clone();
            match event.stress {
                Stress::Primary => {
                    event_voice.base_f0 *= 1.10;
                }
                Stress::Secondary => {
                    event_voice.base_f0 *= 1.05;
                }
                Stress::Unstressed => {}
            }
            let segment = phoneme::synthesize_phoneme_nasalized(
                &event.phoneme,
                &event_voice,
                sample_rate,
                dur,
                nasalizations[i].as_ref(),
            )?;
            segments.push(segment);
        }
        // One crossfade length per segment boundary: higher coarticulation
        // resistance -> shorter crossfade, floored at `transition_window`.
        let mut crossfade_lengths: Vec<usize> =
            Vec::with_capacity(segments.len().saturating_sub(1));
        for i in 0..segments.len().saturating_sub(1) {
            let r_left = self.events[i].phoneme.coarticulation_resistance();
            let r_right = self.events[i + 1].phoneme.coarticulation_resistance();
            let avg_resistance = (r_left + r_right) * 0.5;
            let frac = MAX_CROSSFADE_FRACTION
                - avg_resistance * (MAX_CROSSFADE_FRACTION - MIN_CROSSFADE_FRACTION);
            let shorter_len = segments[i].len().min(segments[i + 1].len());
            let cf_len = (frac * shorter_len as f32) as usize;
            let min_cf = (self.transition_window * sample_rate) as usize;
            // NOTE(review): `min_cf` is not bounded by the segment lengths, so
            // the requested crossfade can exceed a very short segment.
            crossfade_lengths.push(cf_len.max(min_cf));
        }
        let output = crossfade_segments_variable(&segments, &crossfade_lengths);
        Ok(output)
    }

    /// Renders via the trajectory planner: a single continuous source/filter
    /// pass with per-sample formant targets, instead of per-phoneme synthesis
    /// plus crossfading (compare [`Self::render`]).
    ///
    /// # Errors
    /// Returns `SvaraError::ArticulationFailed` if the glottal source cannot
    /// be created.
    pub fn render_planned(&self, voice: &VoiceProfile, sample_rate: f32) -> Result<Vec<f32>> {
        use crate::trajectory::TrajectoryPlanner;
        if self.events.is_empty() {
            return Ok(Vec::new());
        }
        let in_cluster = detect_consonant_clusters(&self.events);
        // Same stress/cluster duration scaling as `render`.
        let durations: Vec<f32> = self
            .events
            .iter()
            .enumerate()
            .map(|(i, e)| {
                let stress_scale = match e.stress {
                    Stress::Primary => 1.15,
                    Stress::Secondary => 1.05,
                    Stress::Unstressed => 0.9,
                };
                let cluster_scale = if in_cluster[i] {
                    CLUSTER_COMPRESSION
                } else {
                    1.0
                };
                e.duration * stress_scale * cluster_scale
            })
            .collect();
        let phoneme_list: Vec<Phoneme> = self.events.iter().map(|e| e.phoneme).collect();
        let nasalizations = phoneme::detect_nasalization(&phoneme_list);
        let mut plan = TrajectoryPlanner::plan(&phoneme_list, &durations, voice, sample_rate);
        plan.apply_speaking_rate(self.speaking_rate);
        let total_samples = plan.total_samples();
        if total_samples == 0 {
            return Ok(Vec::new());
        }
        // Sample index of each phoneme boundary (events.len() + 1 entries).
        // NOTE(review): boundaries are derived from the un-rate-scaled
        // `durations`, while `total_samples` comes from the plan *after*
        // `apply_speaking_rate` — confirm the two stay consistent when
        // speaking_rate != 1.0.
        let mut boundaries = Vec::with_capacity(self.events.len() + 1);
        let mut offset = 0usize;
        boundaries.push(0);
        for &dur in &durations {
            offset += (dur * sample_rate) as usize;
            boundaries.push(offset);
        }
        let mut glottal = voice
            .create_glottal_source(sample_rate)
            .map_err(|e| crate::error::SvaraError::ArticulationFailed(e.to_string()))?;
        let mut tract = crate::tract::VocalTract::new(sample_rate);
        let mut noise = crate::rng::Rng::new(17); // fixed seed: deterministic output
        let mut output = Vec::with_capacity(total_samples);
        let mut current_phoneme_idx = 0;
        for sample_idx in 0..total_samples {
            // Advance to the phoneme containing this sample; clamps at the
            // last event if samples run past the final boundary.
            while current_phoneme_idx + 1 < self.events.len()
                && sample_idx >= boundaries[current_phoneme_idx + 1]
            {
                current_phoneme_idx += 1;
            }
            let phoneme = &self.events[current_phoneme_idx].phoneme;
            let target = plan.formants_at(sample_idx);
            // Ramp nasal coupling in smoothly from the detected onset point.
            if let Some(ref nasal) = nasalizations[current_phoneme_idx] {
                let nasal_onset = boundaries[current_phoneme_idx]
                    + ((boundaries[current_phoneme_idx + 1] - boundaries[current_phoneme_idx])
                        as f32
                        * nasal.onset) as usize;
                if sample_idx >= nasal_onset {
                    let nasal_len = boundaries[current_phoneme_idx + 1]
                        .saturating_sub(nasal_onset)
                        .max(1);
                    let t = (sample_idx - nasal_onset) as f32 / nasal_len as f32;
                    tract.set_nasal_coupling(
                        nasal.peak_coupling * hisab::calc::ease_in_out_smooth(t),
                    );
                    tract.set_nasal_place(nasal.place);
                } else {
                    tract.set_nasal_coupling(0.0);
                }
            } else {
                tract.set_nasal_coupling(0.0);
            }
            let _ = tract.set_formants_from_target(&target);
            let event = &self.events[current_phoneme_idx];
            // Stress raises pitch slightly (same factors as `render`).
            let base_f0 = match event.stress {
                Stress::Primary => voice.base_f0 * 1.10,
                Stress::Secondary => voice.base_f0 * 1.05,
                Stress::Unstressed => voice.base_f0,
            };
            // Lexical tone modulates F0 along the phoneme's normalized time axis.
            let effective_f0 = if let Some(tone) = event.tone {
                let contour = tone.to_contour();
                let seg_start = boundaries[current_phoneme_idx];
                let seg_len = (boundaries[current_phoneme_idx + 1] - seg_start).max(1);
                let t = (sample_idx - seg_start) as f32 / seg_len as f32;
                base_f0 * contour.f0_at(t)
            } else {
                base_f0
            };
            let _ = glottal.set_f0(effective_f0);
            // Excitation model chosen per phoneme class.
            let sample = match phoneme.class() {
                // Voiced resonants: glottal pulse filtered by the vocal tract.
                PhonemeClass::Vowel
                | PhonemeClass::Diphthong
                | PhonemeClass::Approximant
                | PhonemeClass::Lateral
                | PhonemeClass::Nasal
                | PhonemeClass::Implosive
                | PhonemeClass::Trill => tract.process_sample(glottal.next_sample()),
                // Fricatives: noise, mixed with voicing when voiced.
                PhonemeClass::Fricative => {
                    let n = noise.next_f32() * 0.5;
                    if phoneme.is_voiced() {
                        n * 0.6 + tract.process_sample(glottal.next_sample()) * 0.4
                    } else {
                        n * 0.6
                    }
                }
                // Stop-like classes: silent closure for the first 40% of the
                // phoneme, then a decaying noise burst.
                PhonemeClass::Plosive
                | PhonemeClass::Affricate
                | PhonemeClass::Click
                | PhonemeClass::Ejective => {
                    let burst_frac = (sample_idx - boundaries[current_phoneme_idx]) as f32
                        / (boundaries[current_phoneme_idx + 1] - boundaries[current_phoneme_idx])
                            .max(1) as f32;
                    if burst_frac < 0.4 {
                        0.0
                    } else {
                        noise.next_f32() * (1.0 - burst_frac) * 0.5
                    }
                }
                PhonemeClass::Silence => 0.0,
            };
            output.push(sample);
        }
        // Short smooth fade-in/out (5% of length, capped at 256 samples) to
        // avoid clicks at the buffer edges.
        let len = output.len();
        let ramp = (len / 20).clamp(1, 256);
        for (i, s) in output.iter_mut().enumerate().take(ramp) {
            *s *= hisab::calc::ease_in_out_smooth(i as f32 / ramp as f32);
        }
        for i in 0..ramp {
            let idx = len - 1 - i;
            output[idx] *= hisab::calc::ease_in_out_smooth(i as f32 / ramp as f32);
        }
        Ok(output)
    }
}
impl Default for PhonemeSequence {
fn default() -> Self {
Self::new()
}
}
/// Flags every event that belongs to a consonant cluster, i.e. a maximal run
/// of two or more consecutive consonants. Anything that is not a vowel,
/// diphthong, or silence counts as a consonant here.
fn detect_consonant_clusters(events: &[PhonemeEvent]) -> Vec<bool> {
    // Precompute a consonant flag per event.
    let cons: Vec<bool> = events
        .iter()
        .map(|e| {
            !matches!(
                e.phoneme.class(),
                PhonemeClass::Vowel | PhonemeClass::Diphthong | PhonemeClass::Silence
            )
        })
        .collect();
    let mut in_cluster = alloc::vec![false; events.len()];
    // Scan for maximal consonant runs; only runs of length >= 2 are clusters.
    let mut i = 0;
    while i < cons.len() {
        if cons[i] {
            let start = i;
            while i < cons.len() && cons[i] {
                i += 1;
            }
            if i - start >= 2 {
                for flag in &mut in_cluster[start..i] {
                    *flag = true;
                }
            }
        } else {
            i += 1;
        }
    }
    in_cluster
}
/// Smooth ease-in-out fade curve over [0, 1]; the input is clamped first.
#[inline]
fn sigmoid_fade(t: f32) -> f32 {
    let clamped = t.clamp(0.0, 1.0);
    hisab::calc::ease_in_out_smooth(clamped)
}
/// Concatenates `segments` with a per-boundary overlap-add crossfade.
///
/// `crossfade_lengths[i]` is the number of samples over which segment `i`
/// fades out while segment `i + 1` fades in on top of it. Requested lengths
/// are clamped to what the neighboring audio can actually provide, so
/// oversized requests (e.g. a `transition_window` floor longer than a short
/// segment) degrade to a shorter crossfade instead of panicking.
fn crossfade_segments_variable(segments: &[Vec<f32>], crossfade_lengths: &[usize]) -> Vec<f32> {
    if segments.is_empty() {
        return Vec::new();
    }
    if segments.len() == 1 {
        return segments[0].clone();
    }
    // Overlapped samples are shared, so the result is roughly the total minus
    // the summed overlaps.
    let total_samples: usize = segments.iter().map(|s| s.len()).sum();
    let overlap: usize = crossfade_lengths.iter().sum();
    let estimated_len = total_samples.saturating_sub(overlap);
    let mut output = Vec::with_capacity(estimated_len);
    for (i, segment) in segments.iter().enumerate() {
        let cf_next = if i < crossfade_lengths.len() {
            crossfade_lengths[i]
        } else {
            0
        };
        let cf_prev = if i > 0 { crossfade_lengths[i - 1] } else { 0 };
        if i == 0 {
            // First segment: emit the un-faded head, then fade out the tail
            // that the next segment will overlap-add into.
            if segment.len() > cf_next {
                output.extend_from_slice(&segment[..segment.len() - cf_next]);
            }
            let fade_start = segment.len().saturating_sub(cf_next);
            for (j, &sample) in segment[fade_start..].iter().enumerate() {
                let t = j as f32 / cf_next.max(1) as f32;
                output.push(sample * (1.0 - sigmoid_fade(t)));
            }
        } else {
            // BUGFIX: also clamp the fade-in length to the audio emitted so
            // far. Previously, a crossfade longer than `output.len()` made
            // `output_len - (fade_len - j)` underflow `usize` and panic.
            let fade_len = cf_prev.min(segment.len()).min(output.len());
            let output_len = output.len();
            // Fade this segment in on top of the previous segment's faded tail.
            for (j, &seg_sample) in segment.iter().enumerate().take(fade_len) {
                let t = j as f32 / fade_len.max(1) as f32;
                // In-bounds: fade_len <= output_len and j < fade_len, so
                // output_len - fade_len <= idx < output_len.
                let idx = output_len - (fade_len - j);
                output[idx] += seg_sample * sigmoid_fade(t);
            }
            if segment.len() > fade_len {
                if i < segments.len() - 1 && segment.len() > fade_len + cf_next {
                    // Middle segment: copy the un-faded interior, then fade
                    // out the tail for the next boundary.
                    output.extend_from_slice(&segment[fade_len..segment.len() - cf_next]);
                    let fade_start = segment.len() - cf_next;
                    for (j, &sample) in segment[fade_start..].iter().enumerate() {
                        let t = j as f32 / cf_next.max(1) as f32;
                        output.push(sample * (1.0 - sigmoid_fade(t)));
                    }
                } else {
                    // Last segment, or too short to fade out: copy the rest verbatim.
                    output.extend_from_slice(&segment[fade_len..]);
                }
            }
        }
    }
    output
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::phoneme::Phoneme;
    use alloc::vec;

    /// Rendering an empty sequence succeeds and yields no samples.
    #[test]
    fn test_empty_sequence() {
        let seq = PhonemeSequence::new();
        assert!(seq.is_empty());
        assert_eq!(seq.len(), 0);
        let voice = VoiceProfile::new_male();
        let result = seq.render(&voice, 44100.0);
        assert!(result.is_ok());
        assert!(result.unwrap().is_empty());
    }

    /// A single phoneme renders to non-empty, finite audio.
    #[test]
    fn test_single_phoneme() {
        let mut seq = PhonemeSequence::new();
        seq.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Primary));
        assert_eq!(seq.len(), 1);
        let voice = VoiceProfile::new_male();
        let result = seq.render(&voice, 44100.0);
        assert!(result.is_ok());
        let samples = result.unwrap();
        assert!(!samples.is_empty());
        assert!(samples.iter().all(|s| s.is_finite()));
    }

    /// A vowel-nasal-vowel sequence renders to finite audio.
    #[test]
    fn test_multi_phoneme() {
        let mut seq = PhonemeSequence::new();
        seq.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Primary));
        seq.push(PhonemeEvent::new(Phoneme::NasalN, 0.06, Stress::Unstressed));
        seq.push(PhonemeEvent::new(Phoneme::VowelI, 0.1, Stress::Secondary));
        let voice = VoiceProfile::new_male();
        let result = seq.render(&voice, 44100.0);
        assert!(result.is_ok());
        let samples = result.unwrap();
        assert!(!samples.is_empty());
        assert!(samples.iter().all(|s| s.is_finite()));
    }

    /// `total_duration` sums the nominal (unscaled) event durations.
    #[test]
    fn test_total_duration() {
        let mut seq = PhonemeSequence::new();
        seq.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Unstressed));
        seq.push(PhonemeEvent::new(Phoneme::VowelE, 0.2, Stress::Unstressed));
        assert!((seq.total_duration() - 0.3).abs() < f32::EPSILON);
    }

    /// The crossfade between two vowels should not produce sample-to-sample
    /// jumps large enough to be heard as clicks.
    #[test]
    fn test_crossfade_no_clicks() {
        let mut seq = PhonemeSequence::new();
        seq.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Primary));
        seq.push(PhonemeEvent::new(Phoneme::VowelI, 0.1, Stress::Primary));
        let voice = VoiceProfile::new_male();
        let samples = seq.render(&voice, 44100.0).unwrap();
        let max_jump = samples
            .windows(2)
            .map(|w| (w[1] - w[0]).abs())
            .fold(0.0f32, f32::max);
        let max_amp = samples.iter().map(|s| s.abs()).fold(0.0f32, f32::max);
        // Only meaningful when the output is not essentially silent.
        if max_amp > 0.001 {
            assert!(
                max_jump < max_amp * 2.0,
                "potential click detected: max_jump={max_jump}, max_amp={max_amp}"
            );
        }
    }

    /// A sequence survives a JSON serialize/deserialize round trip.
    #[test]
    fn test_serde_roundtrip() {
        let mut seq = PhonemeSequence::new();
        seq.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Primary));
        let json = serde_json::to_string(&seq).unwrap();
        let seq2: PhonemeSequence = serde_json::from_str(&json).unwrap();
        assert_eq!(seq2.len(), 1);
    }

    /// A lone consonant between vowels is not flagged as a cluster.
    #[test]
    fn test_cluster_detection_no_cluster() {
        let events = vec![
            PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Primary),
            PhonemeEvent::new(Phoneme::NasalN, 0.06, Stress::Unstressed),
            PhonemeEvent::new(Phoneme::VowelI, 0.1, Stress::Primary),
        ];
        let clusters = detect_consonant_clusters(&events);
        assert!(!clusters[1]);
    }

    /// Two adjacent consonants are both flagged; surrounding vowels are not.
    #[test]
    fn test_cluster_detection_pair() {
        let events = vec![
            PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Primary),
            PhonemeEvent::new(Phoneme::FricativeS, 0.06, Stress::Unstressed),
            PhonemeEvent::new(Phoneme::PlosiveT, 0.06, Stress::Unstressed),
            PhonemeEvent::new(Phoneme::VowelI, 0.1, Stress::Primary),
        ];
        let clusters = detect_consonant_clusters(&events);
        assert!(!clusters[0]);
        assert!(clusters[1]);
        assert!(clusters[2]);
        assert!(!clusters[3]);
    }

    /// A three-consonant onset is flagged in full; the vowel is not.
    #[test]
    fn test_cluster_detection_triple() {
        let events = vec![
            PhonemeEvent::new(Phoneme::FricativeS, 0.06, Stress::Unstressed),
            PhonemeEvent::new(Phoneme::PlosiveT, 0.06, Stress::Unstressed),
            PhonemeEvent::new(Phoneme::ApproximantR, 0.06, Stress::Unstressed),
            PhonemeEvent::new(Phoneme::VowelI, 0.1, Stress::Primary),
        ];
        let clusters = detect_consonant_clusters(&events);
        assert!(clusters[0]);
        assert!(clusters[1]);
        assert!(clusters[2]);
        assert!(!clusters[3]);
    }

    /// Cluster compression should keep a clustered render shorter than the
    /// non-clustered render plus a full extra (stress-scaled) consonant.
    #[test]
    fn test_cluster_renders_shorter() {
        let voice = VoiceProfile::new_male();
        let mut seq_no = PhonemeSequence::new();
        seq_no.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Unstressed));
        seq_no.push(PhonemeEvent::new(
            Phoneme::FricativeS,
            0.08,
            Stress::Unstressed,
        ));
        seq_no.push(PhonemeEvent::new(Phoneme::VowelI, 0.1, Stress::Unstressed));
        let out_no = seq_no.render(&voice, 44100.0).unwrap();
        let mut seq_cl = PhonemeSequence::new();
        seq_cl.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Unstressed));
        seq_cl.push(PhonemeEvent::new(
            Phoneme::FricativeS,
            0.08,
            Stress::Unstressed,
        ));
        seq_cl.push(PhonemeEvent::new(
            Phoneme::PlosiveT,
            0.08,
            Stress::Unstressed,
        ));
        seq_cl.push(PhonemeEvent::new(Phoneme::VowelI, 0.1, Stress::Unstressed));
        let out_cl = seq_cl.render(&voice, 44100.0).unwrap();
        assert!(out_cl.iter().all(|s| s.is_finite()));
        // 0.08 s consonant * 0.9 unstressed scale, at 44.1 kHz.
        let naive_extra = (0.08 * 0.9 * 44100.0) as usize;
        assert!(
            out_cl.len() < out_no.len() + naive_extra,
            "cluster should be compressed: cluster={}, no_cluster={}, naive_extra={}",
            out_cl.len(),
            out_no.len(),
            naive_extra
        );
    }

    /// Planned rendering of an empty sequence succeeds with no samples.
    #[test]
    fn test_render_planned_empty() {
        let seq = PhonemeSequence::new();
        let voice = VoiceProfile::new_male();
        let result = seq.render_planned(&voice, 44100.0);
        assert!(result.is_ok());
        assert!(result.unwrap().is_empty());
    }

    /// Planned rendering of a single vowel yields non-empty, finite audio.
    #[test]
    fn test_render_planned_single() {
        let mut seq = PhonemeSequence::new();
        seq.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Primary));
        let voice = VoiceProfile::new_male();
        let result = seq.render_planned(&voice, 44100.0);
        assert!(result.is_ok());
        let samples = result.unwrap();
        assert!(!samples.is_empty());
        assert!(samples.iter().all(|s| s.is_finite()));
    }

    /// Planned rendering of a multi-phoneme sequence stays finite.
    #[test]
    fn test_render_planned_multi() {
        let mut seq = PhonemeSequence::new();
        seq.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Primary));
        seq.push(PhonemeEvent::new(Phoneme::NasalN, 0.06, Stress::Unstressed));
        seq.push(PhonemeEvent::new(Phoneme::VowelI, 0.1, Stress::Secondary));
        let voice = VoiceProfile::new_male();
        let samples = seq.render_planned(&voice, 44100.0).unwrap();
        assert!(!samples.is_empty());
        assert!(samples.iter().all(|s| s.is_finite()));
    }

    /// Planned rendering handles plosives and fricatives without NaN/Inf.
    #[test]
    fn test_render_planned_with_consonants() {
        let mut seq = PhonemeSequence::new();
        seq.push(PhonemeEvent::new(
            Phoneme::PlosiveP,
            0.06,
            Stress::Unstressed,
        ));
        seq.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Primary));
        seq.push(PhonemeEvent::new(
            Phoneme::FricativeS,
            0.08,
            Stress::Unstressed,
        ));
        seq.push(PhonemeEvent::new(Phoneme::VowelI, 0.1, Stress::Primary));
        let voice = VoiceProfile::new_male();
        let samples = seq.render_planned(&voice, 44100.0).unwrap();
        assert!(samples.iter().all(|s| s.is_finite()));
    }

    /// Planned rendering of vowels must produce audible (non-silent) output.
    #[test]
    fn test_render_planned_produces_non_silent() {
        let mut seq = PhonemeSequence::new();
        seq.push(PhonemeEvent::new(Phoneme::VowelA, 0.1, Stress::Primary));
        seq.push(PhonemeEvent::new(Phoneme::VowelI, 0.1, Stress::Primary));
        let voice = VoiceProfile::new_male();
        let samples = seq.render_planned(&voice, 44100.0).unwrap();
        assert!(samples.iter().any(|&s| s.abs() > 1e-6));
    }

    /// Tonal events render to non-empty, finite, non-silent audio.
    #[test]
    fn test_render_planned_with_tone() {
        use crate::prosody::Tone;
        let mut seq = PhonemeSequence::new();
        seq.push(PhonemeEvent::with_tone(
            Phoneme::VowelA,
            0.15,
            Stress::Unstressed,
            Tone::Falling,
        ));
        seq.push(PhonemeEvent::with_tone(
            Phoneme::VowelI,
            0.15,
            Stress::Unstressed,
            Tone::Rising,
        ));
        let voice = VoiceProfile::new_male();
        let samples = seq.render_planned(&voice, 44100.0).unwrap();
        assert!(!samples.is_empty());
        assert!(samples.iter().all(|s| s.is_finite()));
        assert!(samples.iter().any(|&s| s.abs() > 1e-6));
    }
}