use alloc::{format, string::ToString, vec, vec::Vec};
use serde::{Deserialize, Serialize};
use tracing::trace;
use crate::error::{Result, SvaraError};
use crate::formant::{Formant, FormantFilter, Vowel, VowelTarget};
use crate::tract::{NasalPlace, VocalTract};
use crate::voice::VoiceProfile;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum Phoneme {
VowelA,
VowelE,
VowelI,
VowelO,
VowelU,
VowelSchwa,
VowelOpenO,
VowelAsh,
VowelNearI,
VowelNearU,
VowelOpenA,
VowelOpenE,
VowelCupV,
VowelBird,
VowelLongI,
DiphthongAI,
DiphthongAU,
DiphthongOI,
DiphthongEI,
DiphthongOU,
PlosiveP,
PlosiveB,
PlosiveT,
PlosiveD,
PlosiveK,
PlosiveG,
FricativeF,
FricativeV,
FricativeS,
FricativeZ,
FricativeSh,
FricativeZh,
FricativeTh,
FricativeDh,
FricativeH,
NasalM,
NasalN,
NasalNg,
AffricateCh,
AffricateJ,
GlottalStop,
TapFlap,
LateralL,
ApproximantR,
ApproximantW,
ApproximantJ,
Silence,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum PhonemeClass {
Plosive,
Fricative,
Nasal,
Approximant,
Affricate,
Lateral,
Vowel,
Diphthong,
Silence,
}
impl Phoneme {
#[must_use]
pub fn class(&self) -> PhonemeClass {
match self {
Self::VowelA
| Self::VowelE
| Self::VowelI
| Self::VowelO
| Self::VowelU
| Self::VowelSchwa
| Self::VowelOpenO
| Self::VowelAsh
| Self::VowelNearI
| Self::VowelNearU
| Self::VowelOpenA
| Self::VowelOpenE
| Self::VowelCupV
| Self::VowelBird
| Self::VowelLongI => PhonemeClass::Vowel,
Self::DiphthongAI
| Self::DiphthongAU
| Self::DiphthongOI
| Self::DiphthongEI
| Self::DiphthongOU => PhonemeClass::Diphthong,
Self::PlosiveP
| Self::PlosiveB
| Self::PlosiveT
| Self::PlosiveD
| Self::PlosiveK
| Self::PlosiveG => PhonemeClass::Plosive,
Self::FricativeF
| Self::FricativeV
| Self::FricativeS
| Self::FricativeZ
| Self::FricativeSh
| Self::FricativeZh
| Self::FricativeTh
| Self::FricativeDh
| Self::FricativeH => PhonemeClass::Fricative,
Self::NasalM | Self::NasalN | Self::NasalNg => PhonemeClass::Nasal,
Self::AffricateCh | Self::AffricateJ => PhonemeClass::Affricate,
Self::GlottalStop => PhonemeClass::Plosive,
Self::TapFlap => PhonemeClass::Plosive,
Self::LateralL => PhonemeClass::Lateral,
Self::ApproximantR | Self::ApproximantW | Self::ApproximantJ => {
PhonemeClass::Approximant
}
Self::Silence => PhonemeClass::Silence,
}
}
#[must_use]
pub fn is_voiced(&self) -> bool {
match self {
Self::PlosiveP
| Self::PlosiveT
| Self::PlosiveK
| Self::FricativeF
| Self::FricativeS
| Self::FricativeSh
| Self::FricativeTh
| Self::FricativeH
| Self::AffricateCh
| Self::GlottalStop
| Self::Silence => false,
_ => true,
}
}
#[must_use]
pub fn coarticulation_resistance(&self) -> f32 {
match self {
Self::VowelI | Self::VowelLongI | Self::VowelNearI => 0.9,
Self::VowelU | Self::VowelNearU => 0.85,
Self::FricativeS | Self::FricativeZ | Self::FricativeSh | Self::FricativeZh => 0.85,
Self::AffricateCh | Self::AffricateJ => 0.85,
Self::VowelA | Self::VowelOpenA | Self::VowelAsh => 0.7,
Self::VowelE | Self::VowelO | Self::VowelOpenO | Self::VowelOpenE => 0.7,
Self::PlosiveT | Self::PlosiveD | Self::NasalN | Self::LateralL => 0.75,
Self::PlosiveK | Self::PlosiveG | Self::NasalNg => 0.7,
Self::PlosiveP | Self::PlosiveB | Self::NasalM => 0.65,
Self::FricativeF | Self::FricativeV => 0.6,
Self::FricativeTh | Self::FricativeDh => 0.6,
Self::VowelCupV | Self::VowelBird => 0.5,
Self::ApproximantR | Self::ApproximantJ => 0.55,
Self::ApproximantW => 0.5,
Self::TapFlap => 0.4,
Self::VowelSchwa => 0.2,
Self::FricativeH => 0.15, Self::GlottalStop => 0.1,
Self::Silence => 0.0,
Self::DiphthongAI
| Self::DiphthongAU
| Self::DiphthongOI
| Self::DiphthongEI
| Self::DiphthongOU => 0.6,
}
}
}
#[must_use]
pub fn f2_locus_equation(phoneme: &Phoneme) -> Option<(f32, f32)> {
match phoneme {
Phoneme::PlosiveP | Phoneme::PlosiveB | Phoneme::NasalM => Some((900.0, 0.85)),
Phoneme::PlosiveT | Phoneme::PlosiveD | Phoneme::NasalN | Phoneme::TapFlap => {
Some((1750.0, 0.55))
}
Phoneme::PlosiveK | Phoneme::PlosiveG | Phoneme::NasalNg => Some((2000.0, 0.70)),
_ => None,
}
}
#[must_use]
pub fn phoneme_formants(phoneme: &Phoneme) -> VowelTarget {
match phoneme {
Phoneme::VowelA => VowelTarget::from_vowel(Vowel::A),
Phoneme::VowelOpenA => VowelTarget::with_bandwidths(
[745.0, 1100.0, 2440.0, 3300.0, 3750.0],
[85.0, 90.0, 110.0, 130.0, 150.0],
),
Phoneme::VowelE => VowelTarget::from_vowel(Vowel::E),
Phoneme::VowelI | Phoneme::VowelLongI => VowelTarget::from_vowel(Vowel::I),
Phoneme::VowelO => VowelTarget::from_vowel(Vowel::O),
Phoneme::VowelU => VowelTarget::from_vowel(Vowel::U),
Phoneme::VowelSchwa => VowelTarget::from_vowel(Vowel::Schwa),
Phoneme::VowelBird => VowelTarget::with_bandwidths(
[580.0, 1400.0, 2500.0, 3300.0, 3750.0],
[70.0, 80.0, 100.0, 120.0, 140.0],
),
Phoneme::VowelOpenO => VowelTarget::from_vowel(Vowel::OpenO),
Phoneme::VowelAsh => VowelTarget::from_vowel(Vowel::Ash),
Phoneme::VowelNearI => VowelTarget::from_vowel(Vowel::NearI),
Phoneme::VowelNearU => VowelTarget::from_vowel(Vowel::NearU),
Phoneme::VowelOpenE | Phoneme::VowelCupV => {
VowelTarget::new(600.0, 1770.0, 2500.0, 3300.0, 3750.0)
}
Phoneme::DiphthongAI | Phoneme::DiphthongAU => VowelTarget::from_vowel(Vowel::A),
Phoneme::DiphthongOI => VowelTarget::from_vowel(Vowel::OpenO),
Phoneme::DiphthongEI => VowelTarget::from_vowel(Vowel::E),
Phoneme::DiphthongOU => VowelTarget::from_vowel(Vowel::O),
Phoneme::PlosiveP | Phoneme::PlosiveB | Phoneme::NasalM => {
VowelTarget::new(350.0, 900.0, 2400.0, 3300.0, 3750.0)
}
Phoneme::PlosiveT | Phoneme::PlosiveD | Phoneme::NasalN | Phoneme::LateralL => {
VowelTarget::new(400.0, 1750.0, 2600.0, 3300.0, 3750.0)
}
Phoneme::PlosiveK | Phoneme::PlosiveG | Phoneme::NasalNg => {
VowelTarget::new(350.0, 1800.0, 2500.0, 3300.0, 3750.0)
}
Phoneme::FricativeF | Phoneme::FricativeV => {
VowelTarget::new(350.0, 1050.0, 2400.0, 3300.0, 3750.0)
}
Phoneme::FricativeS | Phoneme::FricativeZ => {
VowelTarget::new(400.0, 1750.0, 2600.0, 3300.0, 3750.0)
}
Phoneme::FricativeSh | Phoneme::FricativeZh => {
VowelTarget::new(350.0, 1600.0, 2500.0, 3300.0, 3750.0)
}
Phoneme::FricativeTh | Phoneme::FricativeDh => {
VowelTarget::new(350.0, 1400.0, 2500.0, 3300.0, 3750.0)
}
Phoneme::FricativeH => VowelTarget::from_vowel(Vowel::Schwa),
Phoneme::AffricateCh | Phoneme::AffricateJ => {
VowelTarget::new(350.0, 1600.0, 2500.0, 3300.0, 3750.0)
}
Phoneme::GlottalStop => VowelTarget::from_vowel(Vowel::Schwa),
Phoneme::TapFlap => VowelTarget::new(400.0, 1750.0, 2600.0, 3300.0, 3750.0),
Phoneme::ApproximantR => VowelTarget::new(350.0, 1300.0, 1600.0, 3300.0, 3750.0),
Phoneme::ApproximantW => VowelTarget::new(300.0, 700.0, 2200.0, 3300.0, 3750.0),
Phoneme::ApproximantJ => VowelTarget::new(280.0, 2200.0, 2900.0, 3300.0, 3750.0),
Phoneme::Silence => VowelTarget::new(500.0, 1500.0, 2500.0, 3300.0, 3750.0),
}
}
#[must_use]
pub fn phoneme_duration(phoneme: &Phoneme) -> f32 {
match phoneme.class() {
PhonemeClass::Vowel => 0.12,
PhonemeClass::Diphthong => 0.18,
PhonemeClass::Plosive => 0.08,
PhonemeClass::Fricative => 0.10,
PhonemeClass::Nasal => 0.08,
PhonemeClass::Affricate => 0.12,
PhonemeClass::Approximant | PhonemeClass::Lateral => 0.07,
PhonemeClass::Silence => 0.05,
}
}
type NoiseGen = crate::rng::Rng;
fn diphthong_end_target(phoneme: &Phoneme) -> VowelTarget {
match phoneme {
Phoneme::DiphthongAI => VowelTarget::from_vowel(Vowel::I),
Phoneme::DiphthongAU => VowelTarget::from_vowel(Vowel::U),
Phoneme::DiphthongOI => VowelTarget::from_vowel(Vowel::I),
Phoneme::DiphthongEI => VowelTarget::from_vowel(Vowel::I),
Phoneme::DiphthongOU => VowelTarget::from_vowel(Vowel::U),
_ => phoneme_formants(phoneme),
}
}
pub fn synthesize_phoneme(
phoneme: &Phoneme,
voice: &VoiceProfile,
sample_rate: f32,
duration: f32,
) -> Result<Vec<f32>> {
if duration <= 0.0 || !duration.is_finite() {
return Err(SvaraError::InvalidDuration(format!(
"duration must be positive and finite, got {duration}"
)));
}
if sample_rate <= 0.0 {
return Err(SvaraError::ArticulationFailed(
"sample_rate must be positive".to_string(),
));
}
let num_samples = (duration * sample_rate) as usize;
if num_samples == 0 {
return Ok(Vec::new());
}
trace!(?phoneme, duration, num_samples, "synthesizing phoneme");
match phoneme {
Phoneme::Silence => Ok(vec![0.0; num_samples]),
_ => match phoneme.class() {
PhonemeClass::Vowel => synthesize_vowel(phoneme, voice, sample_rate, num_samples),
PhonemeClass::Diphthong => {
synthesize_diphthong(phoneme, voice, sample_rate, num_samples)
}
PhonemeClass::Plosive => synthesize_plosive(phoneme, voice, sample_rate, num_samples),
PhonemeClass::Fricative => {
synthesize_fricative(phoneme, voice, sample_rate, num_samples)
}
PhonemeClass::Nasal => synthesize_nasal(phoneme, voice, sample_rate, num_samples),
PhonemeClass::Affricate => {
synthesize_affricate(phoneme, voice, sample_rate, num_samples)
}
PhonemeClass::Approximant | PhonemeClass::Lateral => {
synthesize_approximant(phoneme, voice, sample_rate, num_samples)
}
PhonemeClass::Silence => Ok(vec![0.0; num_samples]),
},
}
}
fn synthesize_vowel(
phoneme: &Phoneme,
voice: &VoiceProfile,
sample_rate: f32,
num_samples: usize,
) -> Result<Vec<f32>> {
let target = voice.apply_formant_scale(&phoneme_formants(phoneme));
let mut tract = VocalTract::new(sample_rate);
tract.set_formants_from_target(&target)?;
let mut glottal = voice
.create_glottal_source(sample_rate)
.map_err(|e| SvaraError::ArticulationFailed(e.to_string()))?;
let mut output = tract.synthesize(&mut glottal, num_samples);
apply_amplitude_envelope(&mut output, num_samples);
Ok(output)
}
fn synthesize_diphthong(
phoneme: &Phoneme,
voice: &VoiceProfile,
sample_rate: f32,
num_samples: usize,
) -> Result<Vec<f32>> {
let start_target = voice.apply_formant_scale(&phoneme_formants(phoneme));
let end_target = voice.apply_formant_scale(&diphthong_end_target(phoneme));
let mut glottal = voice
.create_glottal_source(sample_rate)
.map_err(|e| SvaraError::ArticulationFailed(e.to_string()))?;
let mut tract = VocalTract::new(sample_rate);
let mut output = Vec::with_capacity(num_samples);
for i in 0..num_samples {
let t = i as f32 / num_samples as f32;
let current = VowelTarget::interpolate(&start_target, &end_target, t);
tract.set_formants_from_target(¤t)?;
let excitation = glottal.next_sample();
output.push(tract.process_sample(excitation));
}
apply_amplitude_envelope(&mut output, num_samples);
Ok(output)
}
fn synthesize_plosive(
phoneme: &Phoneme,
voice: &VoiceProfile,
sample_rate: f32,
num_samples: usize,
) -> Result<Vec<f32>> {
let mut output = vec![0.0; num_samples];
let mut noise = NoiseGen::new(17);
let closure_end = num_samples / 3;
let burst_end = closure_end + (num_samples / 6).max(1);
let target = voice.apply_formant_scale(&phoneme_formants(phoneme));
let formants = target.to_formants();
let mut filter = FormantFilter::new(&formants, sample_rate)
.map_err(|e| SvaraError::ArticulationFailed(e.to_string()))?;
for sample in output.iter_mut().take(burst_end).skip(closure_end) {
let n = noise.next_f32() * 0.8;
*sample = filter.process_sample(n);
}
if phoneme.is_voiced() {
let mut glottal = voice
.create_glottal_source(sample_rate)
.map_err(|e| SvaraError::ArticulationFailed(e.to_string()))?;
glottal.set_breathiness(0.4); let mut tract = VocalTract::new(sample_rate);
tract.set_formants_from_target(&target)?;
for sample in output.iter_mut().skip(burst_end) {
let excitation = glottal.next_sample();
*sample = tract.process_sample(excitation) * 0.5;
}
} else {
for sample in output.iter_mut().skip(burst_end) {
let n = noise.next_f32() * 0.3;
*sample = filter.process_sample(n);
}
}
apply_amplitude_envelope(&mut output, num_samples);
Ok(output)
}
fn synthesize_fricative(
phoneme: &Phoneme,
voice: &VoiceProfile,
sample_rate: f32,
num_samples: usize,
) -> Result<Vec<f32>> {
let target = voice.apply_formant_scale(&phoneme_formants(phoneme));
let mut noise = NoiseGen::new(31);
let formants = fricative_formants(phoneme, sample_rate);
let mut filter = FormantFilter::new(&formants, sample_rate)
.map_err(|e| SvaraError::ArticulationFailed(e.to_string()))?;
let mut output = Vec::with_capacity(num_samples);
if phoneme.is_voiced() {
let mut glottal = voice
.create_glottal_source(sample_rate)
.map_err(|e| SvaraError::ArticulationFailed(e.to_string()))?;
let mut tract = VocalTract::new(sample_rate);
tract.set_formants_from_target(&target)?;
for _ in 0..num_samples {
let n = noise.next_f32() * 0.5;
let friction = filter.process_sample(n);
let voicing = tract.process_sample(glottal.next_sample()) * 0.4;
output.push(friction + voicing);
}
} else {
for _ in 0..num_samples {
let n = noise.next_f32() * 0.6;
output.push(filter.process_sample(n));
}
}
apply_amplitude_envelope(&mut output, num_samples);
Ok(output)
}
fn fricative_formants(phoneme: &Phoneme, _sample_rate: f32) -> Vec<Formant> {
match phoneme {
Phoneme::FricativeS | Phoneme::FricativeZ => vec![
Formant::new(4500.0, 500.0, 1.0),
Formant::new(7000.0, 800.0, 0.7),
],
Phoneme::FricativeSh | Phoneme::FricativeZh => vec![
Formant::new(2800.0, 600.0, 1.0),
Formant::new(5000.0, 800.0, 0.6),
],
Phoneme::FricativeF | Phoneme::FricativeV => vec![
Formant::new(3000.0, 2000.0, 0.5),
Formant::new(8000.0, 2000.0, 0.3),
],
Phoneme::FricativeTh | Phoneme::FricativeDh => vec![
Formant::new(4000.0, 1500.0, 0.4),
Formant::new(7500.0, 1500.0, 0.3),
],
Phoneme::FricativeH => vec![
Formant::new(1500.0, 800.0, 0.4),
Formant::new(2500.0, 800.0, 0.3),
],
Phoneme::AffricateCh | Phoneme::AffricateJ => vec![
Formant::new(2800.0, 600.0, 1.0),
Formant::new(5000.0, 800.0, 0.6),
],
_ => vec![Formant::new(3000.0, 1000.0, 0.5)],
}
}
fn synthesize_nasal(
phoneme: &Phoneme,
voice: &VoiceProfile,
sample_rate: f32,
num_samples: usize,
) -> Result<Vec<f32>> {
let target = voice.apply_formant_scale(&phoneme_formants(phoneme));
let mut glottal = voice
.create_glottal_source(sample_rate)
.map_err(|e| SvaraError::ArticulationFailed(e.to_string()))?;
let mut tract = VocalTract::new(sample_rate);
tract.set_formants_from_target(&target)?;
tract.set_nasal_coupling(0.8);
let place = match phoneme {
Phoneme::NasalM => NasalPlace::Bilabial,
Phoneme::NasalN => NasalPlace::Alveolar,
Phoneme::NasalNg => NasalPlace::Velar,
_ => NasalPlace::Neutral,
};
tract.set_nasal_place(place);
let mut output = tract.synthesize(&mut glottal, num_samples);
apply_amplitude_envelope(&mut output, num_samples);
Ok(output)
}
fn synthesize_approximant(
phoneme: &Phoneme,
voice: &VoiceProfile,
sample_rate: f32,
num_samples: usize,
) -> Result<Vec<f32>> {
let target = voice.apply_formant_scale(&phoneme_formants(phoneme));
let mut glottal = voice
.create_glottal_source(sample_rate)
.map_err(|e| SvaraError::ArticulationFailed(e.to_string()))?;
glottal.set_breathiness(voice.breathiness.max(0.1));
let mut tract = VocalTract::new(sample_rate);
tract.set_formants_from_target(&target)?;
let mut output = tract.synthesize(&mut glottal, num_samples);
for sample in &mut output {
*sample *= 0.7;
}
apply_amplitude_envelope(&mut output, num_samples);
Ok(output)
}
fn synthesize_affricate(
phoneme: &Phoneme,
voice: &VoiceProfile,
sample_rate: f32,
num_samples: usize,
) -> Result<Vec<f32>> {
let mut output = vec![0.0; num_samples];
let mut noise = NoiseGen::new(23);
let closure_end = num_samples / 4;
let burst_end = closure_end + (num_samples / 8).max(1);
let target = voice.apply_formant_scale(&phoneme_formants(phoneme));
let fric_formants = fricative_formants(phoneme, sample_rate);
let mut filter = FormantFilter::new(&fric_formants, sample_rate)
.map_err(|e| SvaraError::ArticulationFailed(e.to_string()))?;
for sample in output.iter_mut().take(burst_end).skip(closure_end) {
let n = noise.next_f32() * 0.8;
*sample = filter.process_sample(n);
}
if phoneme.is_voiced() {
let mut glottal = voice
.create_glottal_source(sample_rate)
.map_err(|e| SvaraError::ArticulationFailed(e.to_string()))?;
let mut tract = VocalTract::new(sample_rate);
tract.set_formants_from_target(&target)?;
for sample in output.iter_mut().skip(burst_end) {
let exc = glottal.next_sample();
let voiced = tract.process_sample(exc);
let fric = filter.process_sample(noise.next_f32() * 0.5);
*sample = voiced * 0.5 + fric * 0.5;
}
} else {
for sample in output.iter_mut().skip(burst_end) {
let n = noise.next_f32() * 0.6;
*sample = filter.process_sample(n);
}
}
apply_amplitude_envelope(&mut output, num_samples);
Ok(output)
}
fn apply_amplitude_envelope(samples: &mut [f32], _total: usize) {
let len = samples.len();
if len == 0 {
return;
}
let ramp_len = (len / 10).clamp(1, 256);
for (i, sample) in samples.iter_mut().enumerate().take(ramp_len) {
let t = i as f32 / ramp_len as f32;
*sample *= hisab::calc::ease_in_out_smooth(t);
}
for i in 0..ramp_len {
let idx = len - 1 - i;
let t = i as f32 / ramp_len as f32;
samples[idx] *= hisab::calc::ease_in_out_smooth(t);
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_phoneme_class() {
assert_eq!(Phoneme::VowelA.class(), PhonemeClass::Vowel);
assert_eq!(Phoneme::PlosiveP.class(), PhonemeClass::Plosive);
assert_eq!(Phoneme::FricativeS.class(), PhonemeClass::Fricative);
assert_eq!(Phoneme::NasalM.class(), PhonemeClass::Nasal);
assert_eq!(Phoneme::DiphthongAI.class(), PhonemeClass::Diphthong);
assert_eq!(Phoneme::ApproximantR.class(), PhonemeClass::Approximant);
assert_eq!(Phoneme::LateralL.class(), PhonemeClass::Lateral);
}
#[test]
fn test_voicing() {
assert!(!Phoneme::PlosiveP.is_voiced());
assert!(Phoneme::PlosiveB.is_voiced());
assert!(Phoneme::VowelA.is_voiced());
assert!(!Phoneme::FricativeS.is_voiced());
assert!(Phoneme::FricativeZ.is_voiced());
}
#[test]
fn test_synthesize_vowel() {
let voice = VoiceProfile::new_male();
let result = synthesize_phoneme(&Phoneme::VowelA, &voice, 44100.0, 0.1);
assert!(result.is_ok());
let samples = result.unwrap();
assert!(!samples.is_empty());
assert!(samples.iter().all(|s| s.is_finite()));
assert!(samples.iter().any(|&s| s.abs() > 1e-6));
}
#[test]
fn test_synthesize_fricative() {
let voice = VoiceProfile::new_male();
let result = synthesize_phoneme(&Phoneme::FricativeS, &voice, 44100.0, 0.08);
assert!(result.is_ok());
}
#[test]
fn test_synthesize_plosive() {
let voice = VoiceProfile::new_male();
let result = synthesize_phoneme(&Phoneme::PlosiveP, &voice, 44100.0, 0.08);
assert!(result.is_ok());
}
#[test]
fn test_synthesize_silence() {
let voice = VoiceProfile::new_male();
let result = synthesize_phoneme(&Phoneme::Silence, &voice, 44100.0, 0.05);
assert!(result.is_ok());
let samples = result.unwrap();
assert!(samples.iter().all(|&s| s.abs() < f32::EPSILON));
}
#[test]
fn test_invalid_duration() {
let voice = VoiceProfile::new_male();
assert!(synthesize_phoneme(&Phoneme::VowelA, &voice, 44100.0, -1.0).is_err());
}
#[test]
fn test_serde_roundtrip() {
let p = Phoneme::VowelA;
let json = serde_json::to_string(&p).unwrap();
let p2: Phoneme = serde_json::from_str(&json).unwrap();
assert_eq!(p, p2);
}
}