use std::collections::BTreeSet;
/// Auxiliary annotations carried by an [`MToken`] in its `underscore` field.
///
/// The derived `Default` leaves both flags `false`, every optional field
/// `None` and `num_flags` empty.
#[derive(Debug, Clone, Default)]
pub struct Underscore {
    /// Head marker; a merged token takes it from the first token of the run.
    pub is_head: bool,
    /// Optional alias string; `merge_tokens` always resets it to `None`.
    pub alias: Option<String>,
    /// Optional stress value; a merged token keeps it only when every input
    /// that has a stress agrees on it.
    pub stress: Option<f32>,
    /// Optional currency character; merging keeps the greatest one present.
    pub currency: Option<char>,
    /// Flag characters; merging yields the sorted union over all inputs.
    pub num_flags: String,
    /// When set, `merge_tokens` may put a space in front of this token's
    /// phonemes while concatenating.
    pub prespace: bool,
    /// Optional rating; merging keeps the lowest one, or `None` as soon as
    /// any input has no rating.
    pub rating: Option<u8>,
}
/// A single token: surface text, tag, trailing whitespace and optional
/// phonemes, plus the extra annotations held in [`Underscore`].
#[derive(Debug, Clone)]
pub struct MToken {
    /// Surface text of the token.
    pub text: String,
    /// Tag string (presumably a part-of-speech label such as `"NN"` — confirm).
    pub tag: String,
    /// Whitespace following the token; `merge_tokens` re-inserts it between
    /// the texts it joins.
    pub whitespace: String,
    /// Phoneme string, or `None` when the token has none.
    pub phonemes: Option<String>,
    /// Additional annotations, see [`Underscore`].
    pub underscore: Underscore,
}
impl MToken {
pub fn new(
text: impl Into<String>,
tag: impl Into<String>,
whitespace: impl Into<String>,
) -> Self {
Self {
text: text.into(),
tag: tag.into(),
whitespace: whitespace.into(),
phonemes: None,
underscore: Underscore::default(),
}
}
}
/// Context flags passed alongside tokens.
///
/// Nothing in the visible part of this file reads these fields, so the
/// per-field notes below are inferred from the names only.
/// The derived `Default` is `future_vowel: None`, `future_to: false`.
#[derive(Debug, Clone, Default)]
pub struct TokenContext {
    /// NOTE(review): presumably whether upcoming material starts with a
    /// vowel, with `None` meaning "unknown" — confirm against the callers.
    pub future_vowel: Option<bool>,
    /// NOTE(review): presumably whether the upcoming word is "to" — confirm
    /// against the callers.
    pub future_to: bool,
}
/// Phoneme characters that `stress_weight` counts with weight 2 instead of 1.
///
/// `'\u{02A4}'` is `ʤ` and `'\u{02A7}'` is `ʧ`.
// NOTE(review): the capital letters are presumably single-character shorthands
// for diphthongs in the phoneme alphabet used by this crate — confirm.
const DIPHTHONGS: &[char] = &['A', 'I', 'O', 'Q', 'W', 'Y', '\u{02A4}', '\u{02A7}'];
/// Returns the weight of a phoneme string.
///
/// Every character listed in `DIPHTHONGS` contributes 2 and every other
/// character contributes 1. `None` (and the empty string) weigh 0.
pub fn stress_weight(ps: Option<&str>) -> usize {
    ps.map_or(0, |phonemes| {
        phonemes.chars().fold(0usize, |total, symbol| {
            let step = if DIPHTHONGS.contains(&symbol) { 2 } else { 1 };
            total + step
        })
    })
}
/// Merges a non-empty run of tokens into a single [`MToken`].
///
/// The merged token is assembled field by field:
///
/// * `text` – the texts of all tokens, each followed by its own
///   `whitespace` except for the last token.
/// * `whitespace` – the `whitespace` of the last token.
/// * `tag` – the tag of the token whose text has the greatest case weight,
///   where a character that lowercasing would change counts 2 and every
///   other character (lowercase or caseless) counts 1. On a tie the earliest
///   token wins.
/// * `phonemes` – `None` when `unk` is `None`. Otherwise the concatenation of
///   every token's phonemes, with `unk` substituted for tokens that have
///   none. One space is inserted in front of a `prespace` token's phonemes
///   when those phonemes are non-empty and the output so far is non-empty
///   and does not already end in whitespace.
/// * `underscore.is_head`, `underscore.prespace` – copied from the first token.
/// * `underscore.alias` – always `None`.
/// * `underscore.stress` – the common stress when every token that has one
///   carries the same bit pattern, `None` otherwise.
/// * `underscore.currency` – the greatest currency character present, if any.
/// * `underscore.num_flags` – the sorted, de-duplicated union of all flags.
/// * `underscore.rating` – the lowest rating, or `None` if any token has none.
///
/// # Panics
///
/// Panics if `tokens` is empty.
pub fn merge_tokens(tokens: &[MToken], unk: Option<&str>) -> MToken {
    /// Sums the per-character case weights of `text` (see `tag` above).
    fn case_weight(text: &str) -> usize {
        text.chars()
            .map(|c| {
                // Digits and punctuation are unchanged by lowercasing, so
                // they weigh the same as lowercase letters.
                if c.to_lowercase().eq(std::iter::once(c)) {
                    1usize
                } else {
                    2
                }
            })
            .sum()
    }

    let (last, leading) = tokens
        .split_last()
        .expect("merge_tokens called with empty slice");
    let first = leading.first().unwrap_or(last);

    let mut text = String::new();
    for tk in leading {
        text.push_str(&tk.text);
        text.push_str(&tk.whitespace);
    }
    text.push_str(&last.text);

    // `max_by_key` keeps the *last* maximum it sees; walking the slice in
    // reverse therefore hands ties to the earliest token.
    let tag_source = tokens
        .iter()
        .rev()
        .max_by_key(|tk| case_weight(&tk.text))
        .unwrap_or(first);

    let phonemes = unk.map(|fallback| {
        let mut out = String::new();
        for tk in tokens {
            let piece = tk.phonemes.as_deref();
            // A separator in front of missing or empty phonemes would only
            // leave a dangling space, so it requires real content to follow.
            let needs_space = tk.underscore.prespace
                && piece.map_or(false, |p| !p.is_empty())
                && !out.is_empty()
                && !out.ends_with(char::is_whitespace);
            if needs_space {
                out.push(' ');
            }
            out.push_str(piece.unwrap_or(fallback));
        }
        out
    });

    // Stresses are compared bit-for-bit, so the result is `Some` only when
    // all present values are exactly identical.
    let mut stresses = tokens.iter().filter_map(|tk| tk.underscore.stress);
    let stress = match stresses.next() {
        None => None,
        Some(seen) => {
            if stresses.all(|other| other.to_bits() == seen.to_bits()) {
                Some(seen)
            } else {
                None
            }
        }
    };

    let currency = tokens.iter().filter_map(|tk| tk.underscore.currency).max();

    // `None` orders below every `Some(_)`, so the minimum over the raw
    // options is `None` as soon as one token is unrated.
    let rating = tokens.iter().map(|tk| tk.underscore.rating).min().flatten();

    let num_flags: String = tokens
        .iter()
        .flat_map(|tk| tk.underscore.num_flags.chars())
        .collect::<BTreeSet<char>>()
        .into_iter()
        .collect();

    MToken {
        text,
        tag: tag_source.tag.clone(),
        whitespace: last.whitespace.clone(),
        phonemes,
        underscore: Underscore {
            is_head: first.underscore.is_head,
            alias: None,
            stress,
            currency,
            num_flags,
            prespace: first.underscore.prespace,
            rating,
        },
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a token with default annotations and the given phonemes.
    fn spoken(text: &str, tag: &str, whitespace: &str, phonemes: &str) -> MToken {
        MToken {
            phonemes: Some(phonemes.to_owned()),
            ..MToken::new(text, tag, whitespace)
        }
    }

    /// Builds a phoneme-less `"X"` token with empty whitespace and the given rating.
    fn rated(text: &str, rating: Option<u8>) -> MToken {
        let mut tk = MToken::new(text, "X", "");
        tk.underscore.rating = rating;
        tk
    }

    #[test]
    fn test_stress_weight_none() {
        let weight = stress_weight(None);
        assert_eq!(weight, 0);
    }

    #[test]
    fn test_stress_weight_plain() {
        let weight = stress_weight(Some("abc"));
        assert_eq!(weight, 3);
    }

    #[test]
    fn test_stress_weight_diphthongs() {
        // 'A' is listed in DIPHTHONGS (2), 'b' is not (1).
        let weight = stress_weight(Some("Ab"));
        assert_eq!(weight, 3);
    }

    #[test]
    fn test_merge_tokens_basic() {
        let parts = [
            spoken("can", "NN", "", "k\u{00E6}n"),
            spoken("not", "RB", " ", "n\u{0251}t"),
        ];
        let merged = merge_tokens(&parts, Some("?"));
        assert_eq!(merged.text, "cannot");
        assert_eq!(merged.phonemes.as_deref(), Some("k\u{00E6}nn\u{0251}t"));
        assert_eq!(merged.whitespace, " ");
    }

    #[test]
    fn test_merge_tokens_prespace() {
        let article = spoken("a", "DT", " ", "\u{0259}");
        let mut noun = spoken("b", "NN", "", "bi");
        noun.underscore.prespace = true;
        let merged = merge_tokens(&[article, noun], Some("?"));
        assert_eq!(merged.phonemes.as_deref(), Some("\u{0259} bi"));
    }

    #[test]
    fn test_merge_tokens_unk_none() {
        // Without a fallback string no phonemes are produced at all.
        let merged = merge_tokens(&[MToken::new("hello", "NN", " ")], None);
        assert_eq!(merged.phonemes, None);
    }

    #[test]
    fn test_merge_tokens_unk_fallback() {
        // A token without phonemes contributes the fallback string.
        let merged = merge_tokens(&[MToken::new("xyz", "NN", "")], Some("?"));
        assert_eq!(merged.phonemes.as_deref(), Some("?"));
    }

    #[test]
    fn test_merge_tokens_rating() {
        let pair = [rated("a", Some(3)), rated("b", Some(5))];
        let merged = merge_tokens(&pair, None);
        assert_eq!(merged.underscore.rating, Some(3));
    }

    #[test]
    fn test_merge_tokens_rating_with_none() {
        let pair = [rated("a", Some(3)), rated("b", None)];
        let merged = merge_tokens(&pair, None);
        assert_eq!(merged.underscore.rating, None);
    }
}