use std::collections::HashSet;
use std::sync::LazyLock;
use regex::Regex;
use crate::custom_dict::CustomDictionary;
use crate::error::G2pError;
use crate::phonemizer::{Phonemizer, ProsodyInfo};
use crate::token_map::token_to_pua;
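// HTS full-context label parsing: the current phoneme sits between "-" and
// "+" (e.g. "a" in "xx^s-a+N=xx"), and the accent values live in the "/A:"
// block as "/A:a1+a2+a3/". RE_A2 relies on "+a2+" being the first "+digits+"
// pair in the label string.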
static RE_PHONEME: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"-([^+]+)\+").expect("RE_PHONEME"));
static RE_A1: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"/A:([\d-]+)\+").expect("RE_A1"));
static RE_A2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\+([0-9]+)\+").expect("RE_A2"));
static RE_A3: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\+([0-9]+)/").expect("RE_A3"));
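// Place-of-articulation sets used to assimilate the moraic nasal /N/ to the
// consonant that follows it.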
static BILABIAL: LazyLock<HashSet<&'static str>> =
LazyLock::new(|| ["m", "my", "b", "by", "p", "py"].into_iter().collect());
static ALVEOLAR: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
["n", "ny", "t", "ty", "d", "dy", "ts", "ch"]
.into_iter()
.collect()
});
static VELAR: LazyLock<HashSet<&'static str>> =
LazyLock::new(|| ["k", "ky", "kw", "g", "gy", "gw"].into_iter().collect());
static SKIP_TOKENS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
["_", "#", "[", "]", "^", "$", "?", "?!", "?.", "?~"]
.into_iter()
.collect()
});
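/// Classify sentence-final punctuation into an end-of-utterance token:
/// "?!" (emphatic), "?." (neutral), "?~" (tag), "?" (generic question), or
/// "$" (declarative). Fullwidth punctuation is accepted in either order
/// ("！？" or "？！"); the ASCII digraphs are matched only as written.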
pub fn get_question_type(text: &str) -> &'static str {
let stripped = text.trim();
if stripped.ends_with("?!")
|| stripped.ends_with("\u{FF01}\u{FF1F}") || stripped.ends_with("\u{FF1F}\u{FF01}")
{
return "?!";
}
if stripped.ends_with("?.")
|| stripped.ends_with("\u{3002}\u{FF1F}") || stripped.ends_with("\u{FF1F}\u{3002}")
{
return "?.";
}
if stripped.ends_with("?~")
|| stripped.ends_with("\u{FF5E}\u{FF1F}") || stripped.ends_with("\u{FF1F}\u{FF5E}")
{
return "?~";
}
if stripped.ends_with('?') || stripped.ends_with('\u{FF1F}') {
return "?";
}
"$" }
pub fn apply_n_phoneme_rules(tokens: &mut [String]) {
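    // Two passes: collect the replacements first, since the look-ahead
    // borrows `tokens` immutably and we cannot mutate it mid-iteration.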
let replacements: Vec<(usize, &str)> = tokens
.iter()
.enumerate()
.filter(|(_, t)| t.as_str() == "N")
.map(|(i, _)| {
let next_phoneme = tokens[i + 1..]
.iter()
.find(|t| !SKIP_TOKENS.contains(t.as_str()))
.map(|s| s.as_str());
let variant = match next_phoneme {
None => "N_uvular",
Some(ph) if BILABIAL.contains(ph) => "N_m",
Some(ph) if ALVEOLAR.contains(ph) => "N_n",
Some(ph) if VELAR.contains(ph) => "N_ng",
Some(_) => "N_uvular",
};
(i, variant)
})
.collect();
for (idx, variant) in replacements {
tokens[idx] = variant.to_string();
}
}
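/// Map multi-character tokens (long vowels, digraphs, N variants, compound
/// question marks) to single Private Use Area codepoints so every token
/// occupies exactly one `char`; single-character tokens pass through
/// unchanged.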
fn map_sequence(tokens: Vec<String>) -> Vec<String> {
tokens
.into_iter()
.map(|t| {
if let Some(pua_char) = token_to_pua(&t) {
pua_char.to_string()
} else {
t
}
})
.collect()
}
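// Field extractors for a full-context label: the current phoneme and the
// accent values a1, a2, a3 from the "/A:" block. Each returns None when the
// field is missing or non-numeric (e.g. "xx").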
fn parse_phoneme(label_str: &str) -> Option<String> {
RE_PHONEME
.captures(label_str)
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
}
fn parse_a1(label_str: &str) -> Option<i32> {
RE_A1
.captures(label_str)
.and_then(|c| c.get(1))
.and_then(|m| m.as_str().parse::<i32>().ok())
}
fn parse_a2(label_str: &str) -> Option<i32> {
RE_A2
.captures(label_str)
.and_then(|c| c.get(1))
.and_then(|m| m.as_str().parse::<i32>().ok())
}
fn parse_a3(label_str: &str) -> Option<i32> {
RE_A3
.captures(label_str)
.and_then(|c| c.get(1))
.and_then(|m| m.as_str().parse::<i32>().ok())
}
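/// Convert full-context labels into a token sequence plus per-token prosody.
/// Leading silence becomes "^" (BOS), trailing silence becomes the question
/// type of `text` (or "$"), "pau" becomes "_", and mid-utterance silence is
/// dropped. Pitch-fall "]", accent-phrase-boundary "#", and pitch-rise "["
/// marks are derived from the A-field values, in the style of
/// pyopenjtalk-based prosody marking. The two returned vectors always have
/// the same length; inserted marks carry `None` prosody.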
fn labels_to_tokens_with_prosody(
label_strings: &[String],
text: &str,
) -> (Vec<String>, Vec<Option<ProsodyInfo>>) {
let mut tokens: Vec<String> = Vec::new();
let mut prosody_info: Vec<Option<ProsodyInfo>> = Vec::new();
let num_labels = label_strings.len();
for (idx, label) in label_strings.iter().enumerate() {
let phoneme = match parse_phoneme(label) {
Some(p) => p,
None => continue,
};
if phoneme == "sil" {
if idx == 0 {
tokens.push("^".to_string());
prosody_info.push(None);
} else if idx == num_labels - 1 {
let q = get_question_type(text);
tokens.push(q.to_string());
prosody_info.push(None);
}
continue;
}
if phoneme == "pau" {
tokens.push("_".to_string());
prosody_info.push(None);
continue;
}
tokens.push(phoneme);
let a1_opt = parse_a1(label);
let a2_opt = parse_a2(label);
let a3_opt = parse_a3(label);
if let (Some(a1), Some(a2), Some(a3)) = (a1_opt, a2_opt, a3_opt) {
prosody_info.push(Some(ProsodyInfo { a1, a2, a3 }));
let a2_next = if idx < num_labels - 1 {
parse_a2(&label_strings[idx + 1]).unwrap_or(-1)
} else {
-1
};
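            // Pitch fall "]": this mora is the accent nucleus (a1 == 0) and
            // the next mora continues the same accent phrase.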
if a1 == 0 && a2_next == a2 + 1 {
tokens.push("]".to_string());
prosody_info.push(None);
}
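            // Accent phrase boundary "#": last mora of the current phrase
            // (a2 == a3) and the next mora restarts counting at 1.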
if a2 == a3 && a2_next == 1 {
tokens.push("#".to_string());
prosody_info.push(None);
}
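            // Pitch rise "[": transition from the first to the second mora
            // of an accent phrase.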
if a2 == 1 && a2_next == 2 {
tokens.push("[".to_string());
prosody_info.push(None);
}
} else {
prosody_info.push(None);
}
}
apply_n_phoneme_rules(&mut tokens);
let mapped = map_sequence(tokens);
(mapped, prosody_info)
}
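/// Japanese phonemizer built on jpreprocess full-context labels.
///
/// A minimal usage sketch (the import path depends on how this module is
/// re-exported; the `Phonemizer` trait must be in scope, and a NAIST-JDIC
/// dictionary must be discoverable, see [`JapanesePhonemizer::new`]):
///
/// ```ignore
/// let phonemizer = JapanesePhonemizer::new()?;
/// let (tokens, prosody) = phonemizer.phonemize_with_prosody("こんにちは。")?;
/// assert_eq!(tokens.len(), prosody.len());
/// ```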
pub struct JapanesePhonemizer {
njd: jpreprocess::JPreprocess<jpreprocess::DefaultFetcher>,
dictionary: Option<CustomDictionary>,
}
impl JapanesePhonemizer {
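    /// Construct with the NAIST-JDIC dictionary bundled at compile time
    /// (requires the `naist-jdic` feature).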
#[cfg(feature = "naist-jdic")]
pub fn new_bundled() -> Result<Self, G2pError> {
let config = jpreprocess::JPreprocessConfig {
dictionary: jpreprocess::SystemDictionaryConfig::Bundled(
jpreprocess::kind::JPreprocessDictionaryKind::NaistJdic,
),
user_dictionary: None,
};
let njd = jpreprocess::JPreprocess::from_config(config)
.map_err(|e| G2pError::JPreprocessInit(e.to_string()))?;
Ok(Self {
njd,
dictionary: None,
})
}
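    /// Construct by locating a dictionary on disk; see `find_dictionary` for
    /// the search order.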
pub fn new() -> Result<Self, G2pError> {
let dict_path = Self::find_dictionary()?;
Self::new_with_dict(&dict_path)
}
pub fn new_with_dict(dict_path: &std::path::Path) -> Result<Self, G2pError> {
let config = jpreprocess::JPreprocessConfig {
dictionary: jpreprocess::SystemDictionaryConfig::File(dict_path.to_path_buf()),
user_dictionary: None,
};
let njd = jpreprocess::JPreprocess::from_config(config)
.map_err(|e| G2pError::JPreprocessInit(e.to_string()))?;
Ok(Self {
njd,
dictionary: None,
})
}
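    /// Construct from a bincode-serialized system dictionary held in memory,
    /// e.g. one embedded in the binary with `include_bytes!`.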
pub fn new_from_serialized_dict(data: &[u8]) -> Result<Self, G2pError> {
let dictionary: jpreprocess::Dictionary =
bincode::deserialize(data).map_err(|e| G2pError::DictionaryLoad {
path: format!("(serialized, {} bytes): {e}", data.len()),
})?;
let njd = jpreprocess::JPreprocess::with_dictionaries(dictionary, None);
Ok(Self {
njd,
dictionary: None,
})
}
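    /// Locate a NAIST-JDIC dictionary: the `JPREPROCESS_DICT` environment
    /// variable first, then `./jpreprocess-naist-jdic`, then
    /// `/usr/share/jpreprocess/naist-jdic`.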
fn find_dictionary() -> Result<std::path::PathBuf, G2pError> {
if let Ok(path) = std::env::var("JPREPROCESS_DICT") {
let p = std::path::PathBuf::from(path);
if p.exists() {
return Ok(p);
}
}
let local = std::path::PathBuf::from("jpreprocess-naist-jdic");
if local.exists() {
return Ok(local);
}
let system = std::path::PathBuf::from("/usr/share/jpreprocess/naist-jdic");
if system.exists() {
return Ok(system);
}
Err(G2pError::JPreprocessInit(
"NAIST-JDIC dictionary not found. Set JPREPROCESS_DICT env var \
or place dictionary at ./jpreprocess-naist-jdic/"
.to_string(),
))
}
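    /// Attach a custom dictionary whose replacements are applied to the input
    /// text before phonemization.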
pub fn set_dictionary(&mut self, dict: CustomDictionary) {
self.dictionary = Some(dict);
}
fn extract_labels(&self, text: &str) -> Result<Vec<String>, G2pError> {
let labels = self
.njd
.extract_fullcontext(text)
.map_err(|e| G2pError::Phonemize(e.to_string()))?;
Ok(labels.iter().map(|l| l.to_string()).collect())
}
}
impl Phonemizer for JapanesePhonemizer {
fn phonemize_with_prosody(
&self,
text: &str,
) -> Result<(Vec<String>, Vec<Option<ProsodyInfo>>), G2pError> {
let mut input = text.to_string();
if let Some(ref dict) = self.dictionary {
input = dict.apply_to_text(&input);
}
let label_strings = self.extract_labels(&input)?;
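        // Question-type detection uses the original `text`, not the
        // dictionary-rewritten input, so replacements cannot mask the final
        // punctuation.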
Ok(labels_to_tokens_with_prosody(&label_strings, text))
}
fn language_code(&self) -> &str {
"ja"
}
}
#[cfg(test)]
mod tests {
use super::*;
fn vec_s(items: &[&str]) -> Vec<String> {
items.iter().map(|s| s.to_string()).collect()
}
#[test]
fn test_question_type_declarative_period() {
assert_eq!(get_question_type("こんにちは。"), "$");
}
#[test]
fn test_question_type_declarative_plain() {
assert_eq!(get_question_type("何もない"), "$");
}
#[test]
fn test_question_type_declarative_whitespace() {
assert_eq!(get_question_type(" "), "$");
}
#[test]
fn test_question_type_generic_ascii() {
assert_eq!(get_question_type("本当?"), "?");
}
#[test]
fn test_question_type_generic_fullwidth() {
assert_eq!(get_question_type("本当?"), "?");
}
#[test]
fn test_question_type_emphatic_ascii() {
assert_eq!(get_question_type("本当?!"), "?!");
}
#[test]
fn test_question_type_emphatic_fullwidth() {
assert_eq!(get_question_type("本当!?"), "?!");
assert_eq!(get_question_type("本当?!"), "?!");
}
#[test]
fn test_question_type_neutral_ascii() {
assert_eq!(get_question_type("そう?."), "?.");
}
#[test]
fn test_question_type_neutral_fullwidth() {
assert_eq!(get_question_type("そう。?"), "?.");
assert_eq!(get_question_type("そう?。"), "?.");
}
#[test]
fn test_question_type_tag_ascii() {
assert_eq!(get_question_type("行くよね?~"), "?~");
}
#[test]
fn test_question_type_tag_fullwidth() {
assert_eq!(get_question_type("行くよね~?"), "?~");
assert_eq!(get_question_type("行くよね?~"), "?~");
}
#[test]
fn test_question_type_with_trailing_whitespace() {
assert_eq!(get_question_type("本当? "), "?");
assert_eq!(get_question_type("本当?! "), "?!");
}
#[test]
fn test_n_before_bilabial_p() {
let mut tokens = vec_s(&["^", "s", "a", "N", "p", "o", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[3], "N_m");
}
#[test]
fn test_n_before_bilabial_m() {
let mut tokens = vec_s(&["^", "a", "N", "m", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[2], "N_m");
}
#[test]
fn test_n_before_bilabial_b() {
let mut tokens = vec_s(&["^", "a", "N", "b", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[2], "N_m");
}
#[test]
fn test_n_before_alveolar_n() {
let mut tokens = vec_s(&["^", "a", "N", "n", "a", "i", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[2], "N_n");
}
#[test]
fn test_n_before_alveolar_t() {
let mut tokens = vec_s(&["^", "a", "N", "t", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[2], "N_n");
}
#[test]
fn test_n_before_alveolar_ts() {
let mut tokens = vec_s(&["^", "N", "ts", "u", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_n");
}
#[test]
fn test_n_before_alveolar_ch() {
let mut tokens = vec_s(&["^", "N", "ch", "i", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_n");
}
#[test]
fn test_n_before_alveolar_ny() {
let mut tokens = vec_s(&["^", "N", "ny", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_n");
}
#[test]
fn test_n_before_alveolar_dy() {
let mut tokens = vec_s(&["^", "N", "dy", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_n");
}
#[test]
fn test_n_before_velar_k() {
let mut tokens = vec_s(&["^", "g", "i", "N", "k", "o", "o", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[3], "N_ng");
}
#[test]
fn test_n_before_velar_g() {
let mut tokens = vec_s(&["^", "a", "N", "g", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[2], "N_ng");
}
#[test]
fn test_n_before_velar_ky() {
let mut tokens = vec_s(&["^", "a", "N", "ky", "o", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[2], "N_ng");
}
#[test]
fn test_n_before_velar_kw() {
let mut tokens = vec_s(&["^", "N", "kw", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_ng");
}
#[test]
fn test_n_before_velar_gw() {
let mut tokens = vec_s(&["^", "N", "gw", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_ng");
}
#[test]
fn test_n_before_velar_gy() {
let mut tokens = vec_s(&["^", "N", "gy", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_ng");
}
#[test]
fn test_n_before_palatalized_bilabial_my() {
let mut tokens = vec_s(&["^", "N", "my", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_m");
}
#[test]
fn test_n_before_palatalized_bilabial_by() {
let mut tokens = vec_s(&["^", "N", "by", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_m");
}
#[test]
fn test_n_before_palatalized_bilabial_py() {
let mut tokens = vec_s(&["^", "N", "py", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_m");
}
#[test]
fn test_n_at_end_of_phrase() {
let mut tokens = vec_s(&["^", "h", "o", "N", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[3], "N_uvular");
}
#[test]
fn test_n_before_vowel() {
let mut tokens = vec_s(&["^", "N", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_uvular");
}
#[test]
fn test_n_before_fricative_s() {
let mut tokens = vec_s(&["^", "N", "s", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_uvular");
}
#[test]
fn test_n_before_h() {
let mut tokens = vec_s(&["^", "N", "h", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_uvular");
}
#[test]
fn test_n_before_r() {
let mut tokens = vec_s(&["^", "N", "r", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_uvular");
}
#[test]
fn test_n_before_w() {
let mut tokens = vec_s(&["^", "N", "w", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_uvular");
}
#[test]
fn test_n_skips_prosody_marks() {
let mut tokens = vec_s(&["^", "N", "#", "[", "k", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_ng");
}
#[test]
fn test_n_skips_pause() {
let mut tokens = vec_s(&["^", "N", "_", "p", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_m");
}
#[test]
fn test_multiple_n_replacements() {
let mut tokens = vec_s(&["^", "N", "k", "a", "#", "N", "p", "o", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "N_ng"); assert_eq!(tokens[5], "N_m"); }
#[test]
fn test_non_n_tokens_unchanged() {
let mut tokens = vec_s(&["^", "k", "o", "N", "n", "i", "ch", "i", "w", "a", "$"]);
apply_n_phoneme_rules(&mut tokens);
assert_eq!(tokens[1], "k"); assert_eq!(tokens[6], "ch"); assert_eq!(tokens[3], "N_n"); }
#[test]
fn test_map_sequence_long_vowels() {
let tokens = vec_s(&["a:", "i:", "u:", "e:", "o:"]);
let mapped = map_sequence(tokens);
assert_eq!(mapped[0], "\u{E000}");
assert_eq!(mapped[1], "\u{E001}");
assert_eq!(mapped[2], "\u{E002}");
assert_eq!(mapped[3], "\u{E003}");
assert_eq!(mapped[4], "\u{E004}");
}
#[test]
fn test_map_sequence_n_variants() {
let tokens = vec_s(&["N_m", "N_n", "N_ng", "N_uvular"]);
let mapped = map_sequence(tokens);
assert_eq!(mapped[0], "\u{E019}");
assert_eq!(mapped[1], "\u{E01A}");
assert_eq!(mapped[2], "\u{E01B}");
assert_eq!(mapped[3], "\u{E01C}");
}
#[test]
fn test_map_sequence_question_marks() {
let tokens = vec_s(&["?!", "?.", "?~"]);
let mapped = map_sequence(tokens);
assert_eq!(mapped[0], "\u{E016}");
assert_eq!(mapped[1], "\u{E017}");
assert_eq!(mapped[2], "\u{E018}");
}
#[test]
fn test_map_sequence_palatalized_consonants() {
let tokens = vec_s(&["ky", "gy", "ny", "ch", "ts", "sh"]);
let mapped = map_sequence(tokens);
assert_eq!(mapped[0], "\u{E006}"); assert_eq!(mapped[1], "\u{E008}"); assert_eq!(mapped[2], "\u{E013}"); assert_eq!(mapped[3], "\u{E00E}"); assert_eq!(mapped[4], "\u{E00F}"); assert_eq!(mapped[5], "\u{E010}"); }
#[test]
fn test_map_sequence_cl() {
let tokens = vec_s(&["cl"]);
let mapped = map_sequence(tokens);
assert_eq!(mapped[0], "\u{E005}");
}
#[test]
fn test_map_sequence_single_char_preserved() {
let tokens = vec_s(&["a", "k", "^", "$", "?", "s", "N", "_", "#"]);
let mapped = map_sequence(tokens);
assert_eq!(
mapped,
vec_s(&["a", "k", "^", "$", "?", "s", "N", "_", "#"])
);
}
#[test]
fn test_map_sequence_mixed() {
let tokens = vec_s(&["^", "k", "o", "N_ng", "#", "a:", "$"]);
let mapped = map_sequence(tokens);
assert_eq!(mapped[0], "^");
assert_eq!(mapped[1], "k");
assert_eq!(mapped[2], "o");
assert_eq!(mapped[3], "\u{E01B}"); assert_eq!(mapped[4], "#");
assert_eq!(mapped[5], "\u{E000}"); assert_eq!(mapped[6], "$");
}
#[test]
fn test_parse_phoneme_sil() {
let label = "xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx";
assert_eq!(parse_phoneme(label), Some("sil".to_string()));
}
#[test]
fn test_parse_phoneme_regular() {
let label = "xx^s-a+N=xx/A:0+2+5/B:xx-xx_xx/C:xx_xx+xx";
assert_eq!(parse_phoneme(label), Some("a".to_string()));
}
#[test]
fn test_parse_phoneme_no_match() {
let label = "no phoneme here";
assert_eq!(parse_phoneme(label), None);
}
#[test]
fn test_parse_a1_negative() {
let label = "xx/A:-3+1+7/B:xx";
assert_eq!(parse_a1(label), Some(-3));
}
#[test]
fn test_parse_a1_zero() {
let label = "xx/A:0+2+5/B:xx";
assert_eq!(parse_a1(label), Some(0));
}
#[test]
fn test_parse_a1_positive() {
let label = "xx/A:4+2+5/B:xx";
assert_eq!(parse_a1(label), Some(4));
}
#[test]
fn test_parse_a2() {
let label = "xx/A:-3+1+7/B:xx";
assert_eq!(parse_a2(label), Some(1));
}
#[test]
fn test_parse_a3() {
let label = "xx/A:-3+1+7/B:xx";
assert_eq!(parse_a3(label), Some(7));
}
#[test]
fn test_parse_a_values_large() {
let label = "xx/A:-12+15+20/B:xx";
assert_eq!(parse_a1(label), Some(-12));
assert_eq!(parse_a2(label), Some(15));
assert_eq!(parse_a3(label), Some(20));
}
#[test]
fn test_bos_eos_from_sil() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, prosody) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert_eq!(tokens[0], "^");
assert_eq!(tokens.last().unwrap(), "$");
assert!(prosody[0].is_none());
assert!(prosody.last().unwrap().is_none());
}
#[test]
fn test_question_ending_in_labels() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, _) = labels_to_tokens_with_prosody(&labels, "本当?");
assert_eq!(tokens.last().unwrap(), "?");
}
#[test]
fn test_emphatic_question_ending_pua_mapped() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, _) = labels_to_tokens_with_prosody(&labels, "本当?!");
assert_eq!(tokens.last().unwrap(), "\u{E016}");
}
#[test]
fn test_pau_becomes_underscore() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-pau+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, _) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert_eq!(tokens[1], "_");
}
#[test]
fn test_accent_nucleus_mark_inserted() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-k+xx=xx/A:0+2+5/B:xx".to_string(),
"xx^xx-o+xx=xx/A:1+3+5/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, _) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert!(tokens.contains(&"]".to_string()));
}
#[test]
fn test_phrase_boundary_mark_inserted() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-k+xx=xx/A:1+3+3/B:xx".to_string(),
"xx^xx-a+xx=xx/A:0+1+2/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, _) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert!(tokens.contains(&"#".to_string()));
}
#[test]
fn test_rising_mark_inserted() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-k+xx=xx/A:1+1+3/B:xx".to_string(),
"xx^xx-o+xx=xx/A:0+2+3/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, _) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert!(tokens.contains(&"[".to_string()));
}
#[test]
fn test_prosody_info_attached_to_phonemes_only() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-k+xx=xx/A:-2+1+5/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (_, prosody) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert!(prosody[0].is_none());
        let pi = prosody[1].as_ref().unwrap();
assert_eq!(pi.a1, -2);
assert_eq!(pi.a2, 1);
assert_eq!(pi.a3, 5);
assert!(prosody[2].is_none());
}
#[test]
fn test_prosody_marks_have_none_prosody() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-k+xx=xx/A:0+2+5/B:xx".to_string(),
"xx^xx-o+xx=xx/A:1+3+5/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, prosody) = labels_to_tokens_with_prosody(&labels, "テスト。");
if let Some(idx) = tokens.iter().position(|t| t == "]") {
assert!(prosody[idx].is_none());
}
}
#[test]
fn test_n_variant_applied_in_label_flow() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-N+xx=xx/A:1+1+2/B:xx".to_string(),
"xx^xx-k+xx=xx/A:0+2+2/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, _) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert!(
tokens.contains(&"\u{E01B}".to_string()),
"Expected N_ng PUA char in tokens: {:?}",
tokens
);
}
#[test]
fn test_mid_sil_is_skipped() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-k+xx=xx/A:1+1+2/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(), "xx^xx-a+xx=xx/A:0+1+2/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, _) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert_eq!(tokens[0], "^");
assert_eq!(*tokens.last().unwrap(), "$");
let bos_count = tokens.iter().filter(|t| t.as_str() == "^").count();
assert_eq!(bos_count, 1);
}
#[test]
fn test_tokens_and_prosody_same_length() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-k+xx=xx/A:0+1+3/B:xx".to_string(),
"xx^xx-o+xx=xx/A:1+2+3/B:xx".to_string(),
"xx^xx-N+xx=xx/A:2+3+3/B:xx".to_string(),
"xx^xx-pau+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-n+xx=xx/A:0+1+2/B:xx".to_string(),
"xx^xx-i+xx=xx/A:1+2+2/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, prosody) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert_eq!(
tokens.len(),
prosody.len(),
"tokens ({}) and prosody ({}) length mismatch.\ntokens: {:?}",
tokens.len(),
prosody.len(),
tokens,
);
}
#[test]
fn test_full_pipeline_with_n_and_prosody() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-k+xx=xx/A:1+1+2/B:xx".to_string(),
"xx^xx-o+xx=xx/A:0+2+2/B:xx".to_string(),
"xx^xx-N+xx=xx/A:1+1+3/B:xx".to_string(),
"xx^xx-k+xx=xx/A:0+2+3/B:xx".to_string(),
"xx^xx-a+xx=xx/A:1+3+3/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, prosody) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert_eq!(tokens.len(), prosody.len());
assert!(
tokens.contains(&"\u{E01B}".to_string()),
"Expected N_ng PUA char in tokens: {:?}",
tokens
);
assert_eq!(tokens[0], "^");
assert_eq!(*tokens.last().unwrap(), "$");
}
#[test]
fn test_full_pipeline_long_vowel_mapped() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-a:+xx=xx/A:0+1+2/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, _) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert!(
tokens.contains(&"\u{E000}".to_string()),
"Expected a: PUA char in tokens: {:?}",
tokens
);
}
#[test]
fn test_full_pipeline_cl_mapped() {
let labels = vec![
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
"xx^xx-cl+xx=xx/A:1+1+2/B:xx".to_string(),
"xx^xx-t+xx=xx/A:0+2+2/B:xx".to_string(),
"xx^xx-sil+xx=xx/A:xx+xx+xx/B:xx".to_string(),
];
let (tokens, _) = labels_to_tokens_with_prosody(&labels, "テスト。");
assert!(
tokens.contains(&"\u{E005}".to_string()),
"Expected cl PUA char in tokens: {:?}",
tokens
);
}
#[test]
fn test_empty_labels() {
let labels: Vec<String> = vec![];
let (tokens, prosody) = labels_to_tokens_with_prosody(&labels, "");
assert!(tokens.is_empty());
assert!(prosody.is_empty());
}
}