mod english;
#[cfg(test)]
mod tests;
mod utils;
use std::borrow::Cow;
use std::ops::Range;
use haqumei_kanalizer::{ConvertOptions, MaxLength};
use unicode_normalization::{IsNormalized, UnicodeNormalization as _, is_nfc_quick, is_nfkc_quick};
use vibrato_rkyv::tokenizer::worker::Worker;
use crate::{
Haqumei, HaqumeiOptions, IuPronunciation, KANALIZER, KANALIZER_CACHE, NjdFeature, OpenJTalk,
Phoneme, ProsodicPhoneme, UnicodeNormalization, VIBRATO_CACHE,
data::MULTI_READ_KANJI_LIST,
errors::HaqumeiError,
features::UnidicFeature,
utils::{
count_mora, is_kanji, is_kanji_feature, is_single_kanji_feature, is_small_kana,
split_kana_mora,
},
};
use utils::{TO_DAKUON, TO_SEION, TO_SEION_CHAR};
impl Haqumei {
    /// Applies the Unicode normalization form selected by `self.options.normalize_unicode`.
    ///
    /// The input is handed back borrowed when normalization is disabled or when the quick
    /// check reports the text as definitely normalized; otherwise a normalized copy is
    /// allocated.
    #[inline(always)]
    pub(crate) fn normalize_unicode_if_needed<'a>(&self, text: &'a str) -> Cow<'a, str> {
        match self.options.normalize_unicode {
            UnicodeNormalization::Nfc if is_nfc_quick(text.chars()) != IsNormalized::Yes => {
                Cow::Owned(text.nfc().collect::<String>())
            }
            UnicodeNormalization::Nfkc if is_nfkc_quick(text.chars()) != IsNormalized::Yes => {
                Cow::Owned(text.nfkc().collect::<String>())
            }
            UnicodeNormalization::None
            | UnicodeNormalization::Nfc
            | UnicodeNormalization::Nfkc => Cow::Borrowed(text),
        }
    }

    /// Copies `read` over `pron` for every feature selected by the enabled options.
    ///
    /// A feature is selected when `use_read_as_pron` is set, when `revert_long_vowels` is set
    /// and `pron` contains a long-vowel mark that `orig` does not, or when `revert_yotsugana`
    /// is set and `read` contains `ヅ` or `ヂ`.
    ///
    /// Debug builds assert that at least one of the three options is enabled.
    pub(crate) fn revert_pron_to_read(&self, njd_features: &mut [NjdFeature]) {
        let opts = &self.options;
        debug_assert!(
            opts.use_read_as_pron || opts.revert_long_vowels || opts.revert_yotsugana
        );

        for feature in njd_features.iter_mut() {
            let revert = if opts.use_read_as_pron {
                true
            } else {
                let long_vowel_added = opts.revert_long_vowels
                    && feature.pron.contains('ー')
                    && !feature.orig.contains('ー');
                let has_yotsugana = opts.revert_yotsugana
                    && feature.read.chars().any(|c| matches!(c, 'ヅ' | 'ヂ'));
                long_vowel_added || has_yotsugana
            };

            if revert {
                feature.pron.clone_from(&feature.read);
            }
        }
    }

    /// Rewrites the first mora of the "iu" sound in each matching feature's `pron`
    /// according to `option`.
    ///
    /// With the `Kanji*` variants only features whose `orig` contains `言` or `云` are
    /// considered.
    pub(crate) fn normalize_iu(&self, njd_features: &mut [NjdFeature], option: IuPronunciation) {
        /// Returns the byte offset in `pron` of the mora to rewrite, or `None` when the
        /// feature must be left untouched. Offsets are multiples of three because they are
        /// counted in three-byte kana.
        fn locate_iu(feature: &NjdFeature, kanji_only: bool) -> Option<usize> {
            let orig = feature.orig.as_str();

            if kanji_only && !orig.contains('言') && !orig.contains('云') {
                return None;
            }

            // Adnominals are only handled for this fixed set of words.
            if feature.pos == "連体詞" {
                return matches!(orig, "こういう" | "そういう" | "どういう" | "ああいう")
                    .then_some(6);
            }

            if orig.starts_with("ていう") || orig.starts_with("という") {
                return Some(3);
            }
            if orig.starts_with("っていう") || orig.starts_with("とかいう") {
                return Some(6);
            }
            if orig.starts_with("あっという")
                || orig.starts_with("アッという")
                || orig.starts_with("あっと言う")
                || orig.starts_with("アッと言う")
            {
                return Some(9);
            }

            let independent_verb = feature.pos == "動詞" && feature.pos_group1 == "自立";
            let independent_adjective = feature.pos == "形容詞"
                && feature.pos_group1.ends_with("自立")
                && feature.ctype == "形容詞・アウオ段";
            let general_adverb = feature.pos == "副詞" && feature.pos_group1 == "一般";
            if !(independent_verb || independent_adjective || general_adverb) {
                return None;
            }

            if feature.pron == "イウ"
                || orig.starts_with("いう")
                || orig.starts_with("言う")
                || orig.starts_with("云う")
            {
                return Some(0);
            }
            if orig.contains("言う") {
                return rfind_iu_sound_in_pron(feature.pron.as_bytes());
            }
            None
        }

        let kanji_only = matches!(option, IuPronunciation::KanjiIu | IuPronunciation::KanjiYuu);
        for f in njd_features.iter_mut() {
            if let Some(start) = locate_iu(f, kanji_only) {
                replace_iu(f, start..start + 3, option);
            }
        }
    }
}
/// Overwrites one three-byte kana of `njd_feature.pron` with `イ` or `ユ`.
///
/// `range` is a byte range into `pron`; callers pass a span exactly three bytes long.
/// `Iu`/`KanjiIu` write `イ`, `Yuu`/`KanjiYuu` write `ユ`.
///
/// The call is a no-op when `range` does not lie inside `pron` or does not fall on
/// character boundaries, so `pron` always remains valid UTF-8.
#[inline(always)]
fn replace_iu(njd_feature: &mut NjdFeature, range: Range<usize>, option: IuPronunciation) {
    // `str::get` returns `None` both for an out-of-bounds range and for one that would split
    // a multi-byte character, which makes the in-place edit below safe without `unsafe`.
    if njd_feature.pron.get(range.clone()).is_none() {
        return;
    }
    debug_assert_eq!(range.end - range.start, 3);

    let replacement = match option {
        IuPronunciation::Iu | IuPronunciation::KanjiIu => "イ",
        IuPronunciation::Yuu | IuPronunciation::KanjiYuu => "ユ",
    };
    njd_feature.pron.replace_range(range, replacement);
}
/// Finds the last byte offset in `bytes` where `イ` or `ユ` is immediately followed by one of
/// `ウ`, `ッ`, `エ`, `オ` or `ー`.
///
/// `bytes` is scanned as raw UTF-8 from the end towards the start. Returns `None` when no such
/// pair exists or when the input is shorter than two three-byte characters.
#[inline(always)]
const fn rfind_iu_sound_in_pron(bytes: &[u8]) -> Option<usize> {
    // Every character of interest is a three-byte sequence starting with this lead byte.
    const LEAD: u8 = 0xE3;

    /// `イ` (E3 82 A4) or `ユ` (E3 83 A6).
    const fn is_i_or_yu(lead: u8, mid: u8, tail: u8) -> bool {
        lead == LEAD && matches!((mid, tail), (0x82, 0xA4) | (0x83, 0xA6))
    }

    /// `ウ` (E3 82 A6), `ッ` (E3 83 83), `エ` (E3 82 A8), `オ` (E3 82 AA) or `ー` (E3 83 BC).
    const fn is_follower(lead: u8, mid: u8, tail: u8) -> bool {
        lead == LEAD
            && matches!(
                (mid, tail),
                (0x82, 0xA6) | (0x83, 0x83) | (0x82, 0xA8) | (0x82, 0xAA) | (0x83, 0xBC)
            )
    }

    if bytes.len() < 6 {
        return None;
    }

    // `upper` is one past the next start offset to examine; the last offset that still leaves
    // room for six bytes is `len - 6`.
    let mut upper = bytes.len() - 5;
    while upper > 0 {
        upper -= 1;
        let at = upper;
        if is_i_or_yu(bytes[at], bytes[at + 1], bytes[at + 2])
            && is_follower(bytes[at + 3], bytes[at + 4], bytes[at + 5])
        {
            return Some(at);
        }
    }
    None
}
/// Adjusts accent data around filler words.
///
/// A filler whose `acc` exceeds its `mora_size` gets `acc` reset to 0, and a noun directly
/// following a filler gets `chain_flag` set to 0.
pub(crate) fn modify_filler_accent(njd_features: &mut [NjdFeature]) {
    let mut prev_was_filler = false;
    for feature in njd_features {
        let is_filler = feature.pos == "フィラー";
        if is_filler {
            if feature.acc > feature.mora_size {
                feature.acc = 0;
            }
        } else if prev_was_filler && feature.pos == "名詞" {
            feature.chain_flag = 0;
        }
        // Only the immediately following feature is affected by a filler.
        prev_was_filler = is_filler;
    }
}
/// Tokenizes `text` with `worker` and converts every token into a [`UnidicFeature`].
///
/// The call goes through `VIBRATO_CACHE.get_with`, keyed by the input text. Each token also
/// records the byte range of every comma-separated field of its feature string in
/// `feature_ranges`.
pub(crate) fn vibrato_analysis(worker: &mut Worker, text: &str) -> Vec<UnidicFeature> {
    VIBRATO_CACHE.get_with(text.to_string(), || {
        worker.reset_sentence(text);
        worker.tokenize();

        worker
            .token_iter()
            .map(|token| {
                let buf = token.to_buf();

                // Walk the fields left to right; each one starts one byte (the comma) after
                // the previous field ended.
                let mut field_ranges = Vec::with_capacity(29);
                let mut cursor = 0;
                field_ranges.extend(buf.feature.split(',').map(|field| {
                    let span = cursor..cursor + field.len();
                    cursor = span.end + 1;
                    span
                }));

                UnidicFeature {
                    surface: buf.surface,
                    feature: buf.feature,
                    range_char: buf.range_char,
                    range_byte: buf.range_byte,
                    lex_type: buf.lex_type,
                    word_id: buf.word_id,
                    left_id: buf.left_id,
                    right_id: buf.right_id,
                    word_cost: buf.word_cost,
                    total_cost: buf.total_cost,
                    feature_ranges: field_ranges,
                }
            })
            .collect()
    })
}
/// Decides from the letters of a word whether it should be sent to the kanalizer.
///
/// * one letter: never;
/// * two letters: consonant + vowel, or an a/e/i/o/u letter followed by a consonant or
///   another a/e/i/o/u letter;
/// * three letters: without any vowel, only when the last two letters are the same
///   continuant; otherwise whatever `english::is_pronounceable` reports;
/// * any other length (including empty): when at least one letter is a vowel.
#[rustfmt::skip]
#[inline(always)]
fn should_use_kanalizer(chars: &[char]) -> bool {
    match *chars {
        [_] => false,
        [first, second] => {
            let consonant_then_vowel =
                english::is_consonant_fullwidth(first) && english::is_vowel_fullwidth(second);
            let aeiou_then_letter = english::is_aeiou_fullwidth(first)
                && (english::is_consonant_fullwidth(second) || english::is_aeiou_fullwidth(second));
            consonant_then_vowel || aeiou_then_letter
        }
        [first, second, third] => {
            let letters = [first, second, third];
            if letters.iter().any(|&c| english::is_vowel_fullwidth(c)) {
                english::is_pronounceable(letters)
            } else {
                second == third && english::is_continuant_fullwidth(second)
            }
        }
        _ => chars.iter().any(|&c| english::is_vowel_fullwidth(c)),
    }
}
/// Replaces the reading of Latin-letter words with a kana reading from the kanalizer.
///
/// Candidates are features whose `pos` is `フィラー` or whose `pos_group1` is
/// `アルファベット`, and whose `string` consists solely of Latin letters. A run of consecutive
/// `アルファベット` features is first merged into a single feature; a lone `アルファベット`
/// feature is left untouched. The merged (or filler) word is then looked up in
/// `KANALIZER_CACHE`, falling back to the kanalizer itself when `should_use_kanalizer`
/// accepts it. On success `read`, `pron` and `mora_size` are replaced and `acc` is reset to 0.
///
/// # Panics
///
/// Panics if locking `KANALIZER` fails.
pub(crate) fn predict_kana_english(njd_features: &mut Vec<NjdFeature>) {
    /// ASCII or full-width Latin letter.
    ///
    /// The helpers used further down (`is_*_fullwidth`, `fullwidth_to_halfwidth`) are written
    /// for full-width letters, so both forms have to pass this gate. The full-width ranges are
    /// spelled as escapes so they survive Unicode normalization of this file.
    fn is_latin_letter(c: char) -> bool {
        matches!(
            c,
            'A'..='Z' | 'a'..='z' | '\u{FF21}'..='\u{FF3A}' | '\u{FF41}'..='\u{FF5A}'
        )
    }

    let mut i = 0;
    while i < njd_features.len() {
        let is_filler = njd_features[i].pos == "フィラー";
        let is_alphabet = njd_features[i].pos_group1 == "アルファベット";
        if !is_filler && !is_alphabet {
            i += 1;
            continue;
        }
        if !njd_features[i].string.chars().all(is_latin_letter) {
            i += 1;
            continue;
        }

        // Extend over the whole run of letter features; fillers are always taken alone.
        let mut end = i + 1;
        if is_alphabet {
            while end < njd_features.len() && njd_features[end].pos_group1 == "アルファベット" {
                end += 1;
            }
            if end - i == 1 {
                i += 1;
                continue;
            }
        }

        if end - i > 1 {
            // Fold the run into its first feature and drop the rest.
            let mut string = String::new();
            let mut orig = String::new();
            let mut read = String::new();
            let mut pron = String::new();
            let mut mora_size = 0;
            for f in &njd_features[i..end] {
                string.push_str(&f.string);
                orig.push_str(&f.orig);
                read.push_str(&f.read);
                pron.push_str(&f.pron);
                mora_size += f.mora_size;
            }
            let merged = &mut njd_features[i];
            merged.string = string;
            merged.orig = orig;
            merged.read = read;
            merged.pron = pron;
            merged.mora_size = mora_size;
            njd_features.drain(i + 1..end);
        }

        let f = &mut njd_features[i];
        if let Some(kana) = KANALIZER_CACHE.get(&f.string) {
            f.read = kana.clone();
            // Keep the mora count in sync with the new reading, exactly as the cache-miss
            // path below does; otherwise a cached word keeps the stale letter-by-letter count.
            f.mora_size = count_mora(&kana) as i32;
            f.pron = kana;
            f.acc = 0;
            i += 1;
            continue;
        }

        let chars: Vec<char> = f.string.chars().collect();
        if should_use_kanalizer(&chars) {
            let mut kanalizer = KANALIZER.lock().unwrap();
            let options = ConvertOptions {
                // `f.string` is non-empty here: `should_use_kanalizer` rejects empty input.
                max_length: MaxLength::Fixed(
                    std::num::NonZeroUsize::new(f.string.len() * 2).unwrap(),
                ),
                error_on_incomplete: false,
                ..Default::default()
            };
            if let Ok(kana) =
                kanalizer.convert_with_options(&english::fullwidth_to_halfwidth(chars), &options)
            {
                KANALIZER_CACHE.insert(f.string.clone(), kana.clone());
                f.read = kana.clone();
                f.mora_size = count_mora(&kana) as i32;
                f.pron = kana;
                f.acc = 0;
            }
        }
        i += 1;
    }
}
impl Haqumei {
    /// Sets the reading of every `何` feature to `ナン` or `ナニ`.
    ///
    /// The choice is delegated to `self.predict_is_nan`, which receives the following feature
    /// (or `None` at the end of the slice). Both `read` and `pron` are overwritten.
    pub(crate) fn predict_nani_reading(&mut self, njd_features: &mut [NjdFeature]) {
        for idx in 0..njd_features.len() {
            if njd_features[idx].orig != "何" {
                continue;
            }

            let reads_as_nan = self.predict_is_nan(njd_features.get(idx + 1));
            let reading = if reads_as_nan { "ナン" } else { "ナニ" };

            let target = &mut njd_features[idx];
            target.pron = reading.to_string();
            target.read = reading.to_string();
        }
    }

    /// Overrides the reading of multi-reading kanji with the pronunciation found in the
    /// UniDic analysis of `text`.
    ///
    /// Tokens come from the pending receiver in `self.rx` when there is one, otherwise from
    /// `VIBRATO_CACHE` or a fresh analysis. A feature is updated only when its `orig` is in
    /// `MULTI_READ_KANJI_LIST` and a token with the same surface starts at the same character
    /// offset.
    ///
    /// # Panics
    ///
    /// Panics if no receiver is pending, the cache misses, and `self.tokenizer` is `None`.
    pub(crate) fn modify_kanji_yomi(&mut self, text: &str, njd_features: &mut [NjdFeature]) {
        let pending_rx = self.rx.take();
        let tokens: Vec<UnidicFeature> = match pending_rx {
            Some(rx) => rx.recv().unwrap_or_default(),
            None => VIBRATO_CACHE.get(text).unwrap_or_else(|| {
                let mut worker = self.tokenizer.as_ref().unwrap().new_worker();
                vibrato_analysis(&mut worker, text)
            }),
        };
        if tokens.is_empty() {
            return;
        }

        let mut unidic_iter = tokens.into_iter().peekable();
        // Character offset of the current NJD feature, advanced by the length of `string`.
        let mut char_offset = 0;

        for feature in njd_features {
            // Drop tokens that end at or before the current offset.
            while unidic_iter
                .next_if(|token| token.range_char.end <= char_offset)
                .is_some()
            {}

            if MULTI_READ_KANJI_LIST.contains(feature.orig.as_str())
                && let Some(token) = unidic_iter.next_if(|token| {
                    token.range_char.start == char_offset && token.surface == feature.orig
                })
            {
                let reading = token.pron();
                feature.pron = reading.to_string();
                feature.read = reading.to_string();
            }

            char_offset += feature.string.chars().count();
        }
    }
}
pub(crate) fn modify_english_words(text: &str, njd_features: &mut [NjdFeature]) {
if njd_features.len() < 2 {
return;
}
#[inline(always)]
fn get_target_word(s: &str) -> Option<&'static str> {
let mut chars = s.chars();
match chars.next()? {
'a' | 'A' | 'a' | 'A' => {
if chars.next().is_none() {
return Some("a");
}
}
'h' | 'H' | 'h' | 'H' => {
if let Some('e' | 'E' | 'e' | 'E') = chars.next()
&& chars.next().is_none()
{
return Some("he");
}
}
's' | 'S' | 's' | 'S' => {
if let Some('h' | 'H' | 'h' | 'H') = chars.next()
&& let Some('e' | 'E' | 'e' | 'E') = chars.next()
&& chars.next().is_none()
{
return Some("she");
}
}
_ => {}
}
None
}
#[inline(always)]
fn is_all_alphabet(s: &str) -> bool {
!s.is_empty()
&& s.chars()
.all(|c| matches!(c, 'A'..='Z' | 'a'..='z' | 'A'..='Z' | 'a'..='z'))
}
let mut text_lower_cache: Option<String> = None;
for i in 0..njd_features.len() {
let curr_orig = &njd_features[i].orig;
if let Some(word) = get_target_word(curr_orig)
&& i + 1 < njd_features.len()
{
let next_orig = &njd_features[i + 1].orig;
if is_all_alphabet(next_orig) {
let next_half = english::to_halfwidth_lower_string(next_orig);
let text_lower = text_lower_cache
.get_or_insert_with(|| english::to_halfwidth_lower_string(text));
let pattern_space = format!("{} {}", word, next_half);
let pattern_full_space = format!("{} {}", word, next_half);
if text_lower.contains(&pattern_space) || text_lower.contains(&pattern_full_space) {
let curr_mut = &mut njd_features[i];
match word {
"a" => {
curr_mut.read = "ア".to_string();
curr_mut.pron = "ア".to_string();
curr_mut.mora_size = 1;
curr_mut.acc = 0; }
"he" => {
curr_mut.read = "ヒー".to_string();
curr_mut.pron = "ヒー".to_string();
curr_mut.mora_size = 2;
curr_mut.acc = 1; }
"she" => {
curr_mut.read = "シー".to_string();
curr_mut.pron = "シー".to_string();
curr_mut.mora_size = 2;
curr_mut.acc = 1; }
_ => unreachable!(),
}
}
}
}
if i > 0 && i < njd_features.len() - 1 {
let curr = &njd_features[i];
let is_two = curr.pos_group1 == "数"
&& curr.pos == "名詞"
&& (curr.orig == "二" || curr.orig == "2" || curr.orig == "2");
if is_two {
let prev_orig = &njd_features[i - 1].orig;
let next_orig = &njd_features[i + 1].orig;
if is_all_alphabet(prev_orig) && is_all_alphabet(next_orig) {
let text_lower = text_lower_cache
.get_or_insert_with(|| english::to_halfwidth_lower_string(text));
let prev_half = english::to_halfwidth_lower_string(prev_orig);
let next_half = english::to_halfwidth_lower_string(next_orig);
let pattern = format!("{}2{}", prev_half, next_half);
if text_lower.contains(&pattern) {
let is_single_both =
prev_orig.chars().count() == 1 && next_orig.chars().count() == 1;
let curr_mut = &mut njd_features[i];
curr_mut.string = "2".to_string();
curr_mut.orig = "2".to_string();
if is_single_both {
curr_mut.read = "ツー".to_string();
curr_mut.pron = "ツー".to_string();
} else {
curr_mut.read = "トゥー".to_string();
curr_mut.pron = "トゥー".to_string();
}
curr_mut.mora_size = 2;
curr_mut.acc = 1; }
}
}
}
}
}
/// Moves an accent nucleus one mora back when it falls on `ー`, `ッ` or `ン`.
///
/// Features are grouped into phrases that start wherever `chain_flag` is `0` or `-1`; the
/// accent of a phrase is the `acc` of its first feature. The kana at the accent position is
/// looked up in the feature's `pron` with small kana removed, and when it is one of the three
/// characters above, the head feature's `acc` is decremented.
pub(crate) fn retreat_acc_nuc(njd_features: &mut [NjdFeature]) {
    const UNSUITABLE_NUCLEUS: [char; 3] = ['ー', 'ッ', 'ン'];
    const SMALL_KANA: [char; 8] = ['ャ', 'ュ', 'ョ', 'ァ', 'ィ', 'ゥ', 'ェ', 'ォ'];

    let mut head = 0;
    // Accent position still to be located, counted in moras from the current feature.
    // Non-positive values mean there is nothing (left) to locate in this phrase.
    let mut remaining = 0;

    for idx in 0..njd_features.len() {
        if matches!(njd_features[idx].chain_flag, 0 | -1) {
            head = idx;
            remaining = njd_features[idx].acc;
        }
        if remaining <= 0 {
            continue;
        }

        let word = &njd_features[idx];
        if remaining > word.mora_size {
            // The nucleus lies in a later feature of this phrase.
            remaining -= word.mora_size;
            continue;
        }

        // Small kana are skipped when indexing, unless that would leave nothing to index.
        let mut kana: Vec<char> = word
            .pron
            .chars()
            .filter(|c| !SMALL_KANA.contains(c))
            .collect();
        if kana.is_empty() {
            kana = word.pron.chars().collect();
        }

        let nucleus = kana.get((remaining - 1) as usize).or(kana.first());
        if nucleus.is_some_and(|c| UNSUITABLE_NUCLEUS.contains(c)) {
            njd_features[head].acc = njd_features[head].acc.saturating_sub(1);
        }
        remaining = -1;
    }
}
/// Re-derives the accent of a phrase from the auxiliaries that follow its nucleus.
///
/// Phrases start wherever `chain_flag` is `0` or `-1`, and unaccented phrases (`acc == 0`)
/// are skipped. Once the feature holding the nucleus has been passed, each following feature
/// may overwrite the head's `acc`:
///
/// * conjugation type `特殊・マス`: moras so far + 1 (+ 2 for the `未然形` form);
/// * conjugation type `特殊・ナイ`: moras so far;
/// * one of `れる`, `られる`, `すぎる`, `せる`, `させる`: moras so far + the suffix's own `acc`;
/// * anything else ends processing for the phrase.
pub(crate) fn modify_acc_after_chaining(njd_features: &mut [NjdFeature]) {
    const ACCENT_SHIFTING_SUFFIXES: [&str; 5] = ["れる", "られる", "すぎる", "せる", "させる"];

    let mut head = 0;
    let mut remaining = 0;
    let mut past_nucleus = false;
    // Moras of the phrase that precede the current feature.
    let mut phrase_moras = 0;

    for idx in 0..njd_features.len() {
        let word = &njd_features[idx];
        if matches!(word.chain_flag, 0 | -1) {
            past_nucleus = false;
            head = idx;
            remaining = word.acc;
            phrase_moras = 0;
        }
        if remaining == 0 {
            continue;
        }

        let mora_size = word.mora_size;
        if past_nucleus {
            let new_head_acc = match word.ctype.as_str() {
                "特殊・マス" => {
                    let shift = if word.cform != "未然形" { 1 } else { 2 };
                    Some(phrase_moras + shift)
                }
                "特殊・ナイ" => Some(phrase_moras),
                _ if ACCENT_SHIFTING_SUFFIXES.contains(&word.orig.as_str()) => {
                    Some(phrase_moras + word.acc)
                }
                _ => None,
            };
            match new_head_acc {
                Some(acc) => njd_features[head].acc = acc,
                None => {
                    past_nucleus = false;
                    remaining = 0;
                }
            }
        } else if remaining <= mora_size {
            past_nucleus = true;
        } else {
            remaining -= mora_size;
        }
        phrase_moras += mora_size;
    }
}
/// Rewrites the part-of-speech fields of `feat` to a general noun (`名詞` / `一般`), with the
/// remaining POS groups and the conjugation fields set to `*`.
#[inline(always)]
fn set_to_noun(feat: &mut NjdFeature) {
    const UNSPECIFIED: &str = "*";

    feat.pos = String::from("名詞");
    feat.pos_group1 = String::from("一般");
    for slot in [
        &mut feat.pos_group2,
        &mut feat.pos_group3,
        &mut feat.ctype,
        &mut feat.cform,
    ] {
        *slot = String::from(UNSPECIFIED);
    }
}
/// Finds the length, in moras, of the shortest unit that is repeated at the end of `read`.
///
/// Voiced kana are mapped through `TO_SEION_CHAR` before comparing. Returns `None` when the
/// reading does not end in two identical runs of moras.
fn detect_odori_unit(read: &str) -> Option<usize> {
    let unvoiced: String = read
        .chars()
        .map(|ch| match is_dakuon(ch) {
            true => TO_SEION_CHAR.get(&ch).copied().unwrap_or(ch),
            false => ch,
        })
        .collect();

    let moras = split_kana_mora(&unvoiced);
    let total = moras.len();

    // With fewer than two moras the candidate range is empty and nothing is found.
    (1..=total / 2).find(|&unit| moras[total - 2 * unit..total - unit] == moras[total - unit..])
}
/// Returns `true` for voiced hiragana and katakana of the ga, za, da and ba rows, and for `ヴ`.
#[rustfmt::skip]
pub(crate) fn is_dakuon(c: char) -> bool {
    const VOICED_KANA: &str = concat!(
        "がぎぐげご", "ざじずぜぞ", "だぢづでど", "ばびぶべぼ",
        "ガギグゲゴ", "ザジズゼゾ", "ダヂヅデド", "バビブベボ",
        "ヴ",
    );
    VOICED_KANA.contains(c)
}
pub(crate) fn process_odori_features(
njd_features: &mut Vec<NjdFeature>,
open_jtalk: &mut OpenJTalk,
) -> Result<(), HaqumeiError> {
let mut i = 0;
while i < njd_features.len() {
let orig = &njd_features[i].orig;
if is_dounojiten(orig) {
let mut reanalysis_result = None;
if i > 0 {
let prev = &njd_features[i - 1];
if count_dounojiten(orig) == 1 && is_kanji_feature(prev) {
let prev_chars: Vec<char> = prev.orig.chars().collect();
if prev_chars.len() > 1 {
let last_char = *prev_chars.last().unwrap();
if is_kanji(last_char) {
let next_token_opt = if i + 1 < njd_features.len() {
Some(&njd_features[i + 1])
} else {
None
};
let (target_text, consumed_next) = if let Some(next) = next_token_opt {
if is_single_kanji_feature(next) {
(format!("{}{}", last_char, next.orig), true)
} else {
(last_char.to_string(), false)
}
} else {
(last_char.to_string(), false)
};
reanalysis_result = Some((target_text, consumed_next));
}
}
}
}
if let Some((text, consumed_next)) = reanalysis_result {
let mut analyzed = open_jtalk.run_frontend(&text)?;
if let Some(first) = analyzed.get_mut(0) {
first.chain_flag = 1;
}
let range_end = if consumed_next { i + 2 } else { i + 1 };
let analyzed_len = analyzed.len();
if range_end <= njd_features.len() {
njd_features.splice(i..range_end, analyzed);
if !consumed_next && analyzed_len > 0 {
set_to_noun(&mut njd_features[i]);
i += 1;
} else {
i += analyzed_len;
}
continue;
}
}
let start = i;
let mut end = i;
let mut total_odori = 0;
while end < njd_features.len() && is_dounojiten(&njd_features[end].orig) {
total_odori += count_dounojiten(&njd_features[end].orig);
end += 1;
}
if i > 0 && njd_features[i - 1].orig.ends_with('々') {
let prev = &njd_features[i - 1];
let base_acc = prev.acc;
if let Some(period) = detect_odori_unit(&prev.read) {
let raw_read_moras = split_kana_mora(&prev.read);
let raw_pron_moras = split_kana_mora(&prev.pron);
if raw_read_moras.len() >= period {
let unit_read = raw_read_moras[raw_read_moras.len() - period..].join("");
let unit_pron = raw_pron_moras[raw_pron_moras.len() - period..].join("");
let unit_mora =
(prev.mora_size / raw_read_moras.len() as i32) * period as i32;
let current_feat = &mut njd_features[i];
let count = count_dounojiten(¤t_feat.orig);
current_feat.read = unit_read.repeat(count);
current_feat.pron = unit_pron.repeat(count);
current_feat.mora_size = unit_mora * count as i32;
current_feat.acc = base_acc;
current_feat.chain_flag = 1;
if current_feat.pos == "記号" {
set_to_noun(current_feat);
}
i += 1;
continue;
}
}
}
let mut normal_indices = Vec::new();
let mut j = start;
let mut collected_chars = 0;
let needed_chars = total_odori.min(8);
while j > 0 {
j -= 1;
let target = &njd_features[j];
if matches!(target.pos.as_str(), "記号" | "フィラー" | "感動詞") {
break;
}
if is_kanji_feature(target) {
normal_indices.push(j);
collected_chars += target.orig.chars().count();
if collected_chars >= needed_chars {
break;
}
} else {
break;
}
}
normal_indices.reverse();
if normal_indices.is_empty() {
i = end;
continue;
}
let base_acc = njd_features[normal_indices[0]].acc;
let is_single_kanji = normal_indices.len() == 1
&& njd_features[normal_indices[0]].orig.chars().count() == 1;
let (base_read, base_pron, base_mora_size) = if is_single_kanji {
let f = &njd_features[normal_indices[0]];
(f.read.clone(), f.pron.clone(), f.mora_size)
} else {
let mut r = String::new();
let mut p = String::new();
let mut m = 0;
for &idx in &normal_indices {
r.push_str(&njd_features[idx].read);
p.push_str(&njd_features[idx].pron);
m += njd_features[idx].mora_size;
}
(r, p, m)
};
for mut njd_feature in njd_features.iter_mut().take(end).skip(start) {
let current_odori = count_dounojiten(&njd_feature.orig);
let feat = &mut njd_feature;
if is_single_kanji {
feat.read = base_read.repeat(current_odori);
feat.pron = base_pron.repeat(current_odori);
feat.mora_size = base_mora_size * current_odori as i32;
} else {
feat.read = base_read.clone();
feat.pron = base_pron.clone();
feat.mora_size = base_mora_size;
}
feat.acc = base_acc; feat.chain_flag = 1;
if feat.pos == "記号" {
set_to_noun(feat);
}
}
i = end;
} else if is_ichinojiten(orig) {
if i > 0 {
if njd_features[i - 1].pos != "記号" {
let mut prev_index = None;
let mut k = i;
while k > 0 {
k -= 1;
if njd_features[k].pos != "記号" && njd_features[k].mora_size > 0 {
prev_index = Some(k);
break;
}
}
if let Some(pidx) = prev_index {
let prev_read = njd_features[pidx].read.clone();
let prev_pron = njd_features[pidx].pron.clone();
let prev_mora_size = njd_features[pidx].mora_size;
let curr = &mut njd_features[i];
apply_odoriji_logic(curr, &prev_read, &prev_pron, prev_mora_size);
}
}
}
i += 1;
} else {
i += 1;
}
}
Ok(())
}
/// Returns `true` when `orig` is non-empty and consists only of the kanji iteration mark `々`.
#[inline(always)]
fn is_dounojiten(orig: &str) -> bool {
    let mut chars = orig.chars();
    // Demanding a first character rules out the empty string.
    chars.next() == Some('々') && chars.all(|c| c == '々')
}
/// Returns `true` when `orig` is non-empty and consists only of the kana iteration marks
/// `ゝ`, `ゞ`, `ヽ` and `ヾ`.
#[inline(always)]
fn is_ichinojiten(orig: &str) -> bool {
    const KANA_ITERATION_MARKS: [char; 4] = ['ゝ', 'ゞ', 'ヽ', 'ヾ'];

    let mut seen_any = false;
    for c in orig.chars() {
        if !KANA_ITERATION_MARKS.contains(&c) {
            return false;
        }
        seen_any = true;
    }
    seen_any
}
/// Counts the occurrences of the kanji iteration mark `々` in `orig`.
#[inline(always)]
fn count_dounojiten(orig: &str) -> usize {
    orig.matches('々').count()
}
/// Gives a kana iteration-mark feature the last mora of the preceding word.
///
/// * `prev_read` / `prev_pron`: reading and pronunciation of the preceding word; `’`
///   characters are stripped from the pronunciation, and the reading is used in its place
///   when nothing remains.
/// * `prev_mora_size`: mora count of the preceding word; the mark receives that count divided
///   by the number of moras in `prev_read`.
///
/// The first iteration mark in `odori_feature.orig` selects the voicing: `ゞ`/`ヾ` map the
/// mora through `TO_DAKUON`; `ゝ`/`ヽ` map it through `TO_SEION` unless the mora contains a
/// small kana, in which case it is copied unchanged. A `記号` feature is turned into a noun.
///
/// Nothing is changed when `prev_read` yields no moras.
fn apply_odoriji_logic(
    odori_feature: &mut NjdFeature,
    prev_read: &str,
    prev_pron: &str,
    prev_mora_size: i32,
) {
    let read_moras = split_kana_mora(prev_read);

    let stripped_pron = if prev_pron.contains('’') {
        Cow::Owned(prev_pron.replace('’', ""))
    } else {
        Cow::Borrowed(prev_pron)
    };
    let pron_source = if stripped_pron.is_empty() {
        prev_read
    } else {
        stripped_pron.as_ref()
    };
    let pron_moras = split_kana_mora(pron_source);

    let Some(target_read) = read_moras.last() else {
        return;
    };
    let target_pron = pron_moras.last().unwrap_or(target_read);
    let mora_val = prev_mora_size / read_moras.len() as i32;

    // Only the first iteration mark decides whether the repeated mora is voiced.
    let is_forced_voiced = odori_feature
        .orig
        .chars()
        .find(|c| matches!(c, 'ゞ' | 'ヾ' | 'ゝ' | 'ヽ'))
        .is_some_and(|c| matches!(c, 'ゞ' | 'ヾ'));
    let has_no_small_kana = !target_read.chars().any(is_small_kana);

    let (read, pron) = if is_forced_voiced {
        (
            TO_DAKUON
                .get(target_read)
                .copied()
                .unwrap_or(target_read)
                .to_string(),
            TO_DAKUON
                .get(target_pron)
                .copied()
                .unwrap_or(target_pron)
                .to_string(),
        )
    } else if has_no_small_kana {
        (
            TO_SEION
                .get(target_read)
                .copied()
                .unwrap_or(target_read)
                .to_string(),
            TO_SEION
                .get(target_pron)
                .copied()
                .unwrap_or(target_pron)
                .to_string(),
        )
    } else {
        (target_read.to_string(), target_pron.to_string())
    };

    odori_feature.read = read;
    odori_feature.pron = pron;
    odori_feature.mora_size = mora_val;
    if odori_feature.pos == "記号" {
        set_to_noun(odori_feature);
    }
}
/// Resolves allophones in place over a sequence of phonemes.
///
/// Every phoneme is passed, together with the phoneme that follows it (`None` for the last
/// one), through `resolve_q_final_glottal_stop`, `resolve_q_allophone` and
/// `resolve_n_allophone`, in that order. Nothing is done when none of the N-split, Q-split or
/// final-glottal-stop behaviours is enabled in `options`.
pub(crate) fn apply_allophones<'a, I>(phonemes: I, options: &HaqumeiOptions)
where
    I: IntoIterator<Item = &'a mut Phoneme>,
{
    let split_n = options.split_n_allophones || options.use_allophones;
    let split_n_r = options.split_n_before_r;
    let split_n_pa = options.split_n_before_palatal_affricate;
    let split_q = options.split_q_allophones || options.use_allophones;
    let final_glottal = options.enable_final_glottal_stop || options.use_allophones;

    if !(split_n || split_q || final_glottal) {
        return;
    }

    let resolve = |current: Phoneme, next: Option<Phoneme>| -> Phoneme {
        let current = current.resolve_q_final_glottal_stop(next, final_glottal);
        let current = current.resolve_q_allophone(next, split_q);
        current.resolve_n_allophone(next, split_n, split_n_r, split_n_pa)
    };

    let mut rest = phonemes.into_iter();
    let Some(mut pending) = rest.next() else {
        return;
    };

    // Each phoneme is resolved once its successor is known; the last one is resolved after
    // the loop with no successor.
    for upcoming in rest {
        *pending = resolve(*pending, Some(*upcoming));
        pending = upcoming;
    }
    *pending = resolve(*pending, None);
}
/// Resolves allophones in place over a sequence of prosodic symbols.
///
/// Works like the plain-phoneme variant, with the successor of a phoneme determined as
/// follows: the next phoneme entry, `Phoneme::Pau` when a pause, interrogative or exclamatory
/// marker comes first, or `None` at the end of the sequence. Accent-phrase boundaries are
/// skipped when looking for the successor. Nothing is done when none of the N-split, Q-split
/// or final-glottal-stop behaviours is enabled in `options`.
pub(crate) fn apply_allophones_to_prosody<'a, I>(phonemes: I, options: &HaqumeiOptions)
where
    I: IntoIterator<Item = &'a mut ProsodicPhoneme>,
{
    let split_n = options.split_n_allophones || options.use_allophones;
    let split_n_r = options.split_n_before_r;
    let split_n_pa = options.split_n_before_palatal_affricate;
    let split_q = options.split_q_allophones || options.use_allophones;
    let final_glottal = options.enable_final_glottal_stop || options.use_allophones;

    if !(split_n || split_q || final_glottal) {
        return;
    }

    let resolve = |current: Phoneme, next: Option<Phoneme>| -> Phoneme {
        let current = current.resolve_q_final_glottal_stop(next, final_glottal);
        let current = current.resolve_q_allophone(next, split_q);
        current.resolve_n_allophone(next, split_n, split_n_r, split_n_pa)
    };

    // Phoneme still waiting to learn what follows it.
    let mut pending: Option<&mut Phoneme> = None;

    for symbol in phonemes {
        match symbol {
            ProsodicPhoneme::Phoneme { phoneme, .. } => {
                if let Some(previous) = pending.take() {
                    *previous = resolve(*previous, Some(*phoneme));
                }
                pending = Some(phoneme);
            }
            ProsodicPhoneme::Pause
            | ProsodicPhoneme::Interrogative
            | ProsodicPhoneme::Exclamatory => {
                if let Some(previous) = pending.take() {
                    *previous = resolve(*previous, Some(Phoneme::Pau));
                }
            }
            ProsodicPhoneme::AccentPhraseBoundary => {}
        }
    }

    if let Some(last) = pending.take() {
        *last = resolve(*last, None);
    }
}