mod utils;
use std::borrow::Cow;
use unicode_normalization::{IsNormalized, UnicodeNormalization as _, is_nfc_quick, is_nfkc_quick};
use vibrato_rkyv::tokenizer::worker::Worker;
use crate::{
Haqumei, NjdFeature, OpenJTalk, UnicodeNormalization, VIBRATO_CACHE,
data::MULTI_READ_KANJI_LIST,
errors::HaqumeiError,
features::UnidicFeature,
utils::{
is_dakuon, is_kanji, is_kanji_feature, is_single_kanji_feature, is_small_kana,
split_kana_mora,
},
};
use utils::{TO_DAKUON, TO_SEION, TO_SEION_CHAR};
impl Haqumei {
    /// Applies the configured Unicode normalization to `text`.
    ///
    /// Borrows the input unchanged when the quick-check confirms it is
    /// already in the requested normal form; otherwise allocates the
    /// normalized string.
    #[inline(always)]
    pub(crate) fn normalize_unicode_if_needed<'a>(&self, text: &'a str) -> Cow<'a, str> {
        match self.options.normalize_unicode {
            UnicodeNormalization::None => Cow::Borrowed(text),
            UnicodeNormalization::Nfc if is_nfc_quick(text.chars()) == IsNormalized::Yes => {
                Cow::Borrowed(text)
            }
            UnicodeNormalization::Nfc => Cow::Owned(text.nfc().collect()),
            UnicodeNormalization::Nfkc if is_nfkc_quick(text.chars()) == IsNormalized::Yes => {
                Cow::Borrowed(text)
            }
            UnicodeNormalization::Nfkc => Cow::Owned(text.nfkc().collect()),
        }
    }

    /// Replaces `pron` with `read` on features selected by the configured
    /// revert options (always, spurious long vowels, or yotsugana ヅ/ヂ).
    pub(crate) fn revert_pron_to_read(&mut self, njd_features: &mut [NjdFeature]) {
        let options = &self.options;
        // At least one revert option must be enabled for this pass to be called.
        debug_assert!(
            options.use_read_as_pron || options.revert_long_vowels || options.revert_yotsugana
        );
        for feature in njd_features.iter_mut() {
            // A long-vowel bar that appears in the pronunciation but not in
            // the original surface was synthesized — revert it.
            let long_vowel_hit = options.revert_long_vowels
                && feature.pron.contains('ー')
                && !feature.orig.contains('ー');
            // Keep the distinct yotsugana readings ヅ/ヂ instead of the
            // merged pronunciation.
            let yotsugana_hit = options.revert_yotsugana
                && (feature.read.contains('ヅ') || feature.read.contains('ヂ'));
            if options.use_read_as_pron || long_vowel_hit || yotsugana_hit {
                feature.pron = feature.read.clone();
            }
        }
    }

    /// Disambiguates the reading of 「何」 (ナン vs ナニ) from the token
    /// that follows it, via `predict_is_nan`.
    pub(crate) fn predict_nani_reading(&mut self, njd_features: &mut [NjdFeature]) {
        for idx in 0..njd_features.len() {
            if njd_features[idx].orig != "何" {
                continue;
            }
            let next_feature = njd_features.get(idx + 1);
            let yomi = if self.predict_is_nan(next_feature) {
                "ナン"
            } else {
                "ナニ"
            };
            njd_features[idx].pron = yomi.to_string();
            njd_features[idx].read = yomi.to_string();
        }
    }
}
/// Fixes up accent information around filler tokens (フィラー).
///
/// A filler whose accent nucleus lies past its own mora count is clamped to
/// heiban (0). A noun immediately following a filler is detached from the
/// accent phrase (`chain_flag = 0`).
pub(crate) fn modify_filler_accent(njd_features: &mut [NjdFeature]) {
    let mut prev_was_filler = false;
    for feat in njd_features.iter_mut() {
        match feat.pos.as_str() {
            "フィラー" => {
                // Out-of-range nucleus on a filler → flatten to heiban.
                if feat.acc > feat.mora_size {
                    feat.acc = 0;
                }
                prev_was_filler = true;
            }
            pos => {
                if prev_was_filler && pos == "名詞" {
                    feat.chain_flag = 0;
                }
                prev_was_filler = false;
            }
        }
    }
}
/// Tokenizes `text` with the vibrato worker and converts each token into a
/// `UnidicFeature`, memoized per input string in `VIBRATO_CACHE`.
pub(crate) fn vibrato_analysis(worker: &mut Worker, text: &str) -> Vec<UnidicFeature> {
    VIBRATO_CACHE.get_with(text.to_string(), || {
        worker.reset_sentence(text);
        worker.tokenize();
        worker
            .token_iter()
            .map(|token| {
                let token = token.to_buf();
                // Precompute the byte range of every comma-separated feature
                // column so later lookups can slice `feature` directly
                // without re-splitting. Capacity 29 presumably matches the
                // UniDic column count — TODO confirm against the dictionary.
                let mut feature_ranges = Vec::with_capacity(29);
                let mut cursor = 0;
                for column in token.feature.split(',') {
                    feature_ranges.push(cursor..cursor + column.len());
                    cursor += column.len() + 1; // +1 skips the separating comma
                }
                UnidicFeature {
                    surface: token.surface,
                    feature: token.feature,
                    range_char: token.range_char,
                    range_byte: token.range_byte,
                    lex_type: token.lex_type,
                    word_id: token.word_id,
                    left_id: token.left_id,
                    right_id: token.right_id,
                    word_cost: token.word_cost,
                    total_cost: token.total_cost,
                    feature_ranges,
                }
            })
            .collect()
    })
}
impl Haqumei {
    /// Overrides the reading of known multi-reading kanji tokens using the
    /// vibrato/UniDic analysis of the same text.
    ///
    /// Token source, in priority order: the prefetch channel `self.rx`
    /// (consumed once via `take()`), then `VIBRATO_CACHE`, then a fresh
    /// tokenization with a new worker. NJD tokens and UniDic tokens are
    /// aligned by character position; when an NJD token whose `orig` is in
    /// `MULTI_READ_KANJI_LIST` lines up exactly with a UniDic token of the
    /// same surface, its `pron`/`read` are replaced by the UniDic
    /// pronunciation.
    pub(crate) fn modify_kanji_yomi(&mut self, text: &str, njd_features: &mut [NjdFeature]) {
        let tokens: Vec<UnidicFeature> = if let Some(rx) = self.rx.take() {
            // Background worker result; fall back to empty on a dropped sender.
            rx.recv().unwrap_or_default()
        } else {
            VIBRATO_CACHE.get(text).unwrap_or_else(|| {
                let mut worker = self.tokenizer.as_ref().unwrap().new_worker();
                vibrato_analysis(&mut worker, text)
            })
        };
        if tokens.is_empty() {
            return;
        }
        let mut unidic_iter = tokens.into_iter().peekable();
        // Character offset of the current NJD token within `text`.
        let mut current_char_pos = 0;
        for njd_feature in njd_features {
            let node_string = &njd_feature.string;
            let node_orig = &njd_feature.orig;
            let node_char_len = node_string.chars().count();
            // Drop UniDic tokens that end at or before the current NJD
            // position — they belong to earlier NJD tokens.
            while let Some(candidate) = unidic_iter.peek() {
                if candidate.range_char.end <= current_char_pos {
                    unidic_iter.next();
                } else {
                    break;
                }
            }
            let mut pron_to_set: Option<String> = None;
            let mut read_to_set: Option<String> = None;
            // Only trust the UniDic reading when the token boundaries and the
            // surface string match exactly.
            if MULTI_READ_KANJI_LIST.contains(node_orig.as_str())
                && let Some(candidate) = unidic_iter.peek()
                && candidate.range_char.start == current_char_pos
                && candidate.surface == *node_orig
            {
                let correct_yomi_token = unidic_iter.next().unwrap();
                let reading = correct_yomi_token.pron();
                pron_to_set = Some(reading.to_string());
                read_to_set = Some(reading.to_string());
            }
            if let Some(pron) = pron_to_set {
                njd_feature.pron = pron;
            }
            if let Some(read) = read_to_set {
                njd_feature.read = read;
            }
            current_char_pos += node_char_len;
        }
    }
}
/// Moves the accent nucleus one mora earlier when it would fall on a mora
/// that cannot carry a nucleus (long-vowel bar ー, sokuon ッ, or moraic ン).
///
/// Walks accent phrases: a token with `chain_flag` 0 or -1 starts a new
/// phrase, whose head token owns the phrase accent (`acc`). `acc` counts
/// down as morae are consumed across chained tokens until the token that
/// contains the nucleus is reached.
pub(crate) fn retreat_acc_nuc(njd_features: &mut [NjdFeature]) {
    if njd_features.is_empty() {
        return;
    }
    const INAPPROPRIATE_FOR_NUCLEAR_CHARS: &[char] = &['ー', 'ッ', 'ン'];
    // Index of the current phrase head; its `acc` field is the one mutated.
    let mut head_index = 0;
    // Remaining morae until the nucleus within the current phrase.
    let mut acc = 0;
    for i in 0..njd_features.len() {
        if njd_features[i].chain_flag == 0 || njd_features[i].chain_flag == -1 {
            head_index = i;
            acc = njd_features[head_index].acc;
        }
        // Remove small kana so that each remaining char is mora-initial and
        // `nth(acc - 1)` indexes the nucleus mora directly.
        // NOTE(review): assumes one char per mora after this filtering —
        // confirm for prons containing the devoicing mark ’.
        const YOUON_CHARS: &[char] = &['ャ', 'ュ', 'ョ', 'ァ', 'ィ', 'ゥ', 'ェ', 'ォ'];
        let pron_without_youon: String = njd_features[i]
            .pron
            .chars()
            .filter(|c| !YOUON_CHARS.contains(c))
            .collect();
        // Fall back to the raw pron if filtering removed everything.
        let pron_ref = if pron_without_youon.is_empty() {
            &njd_features[i].pron
        } else {
            &pron_without_youon
        };
        if acc > 0 {
            if acc <= njd_features[i].mora_size {
                // Nucleus lies inside this token: inspect its mora (falling
                // back to the first char when out of range) and retreat the
                // head's accent by one if it is a non-nucleus-capable mora.
                if pron_ref
                    .chars()
                    .nth((acc - 1) as usize)
                    .or(pron_ref.chars().next())
                    .is_some_and(|nuc_pron| INAPPROPRIATE_FOR_NUCLEAR_CHARS.contains(&nuc_pron))
                {
                    njd_features[head_index].acc = njd_features[head_index].acc.saturating_sub(1);
                }
                // Sentinel: this phrase is done; skip until the next head.
                acc = -1;
            } else {
                // Nucleus is further right; consume this token's morae.
                acc -= njd_features[i].mora_size;
            }
        }
    }
}
/// Recomputes the phrase-head accent when specific suffixes follow the
/// accent nucleus inside a chained accent phrase.
///
/// Rules applied to tokens after the nucleus:
/// * 特殊・マス  → nucleus lands on (phase_len + 1), or +2 for 未然形;
/// * 特殊・ナイ  → nucleus lands at phase_len (end of the stem);
/// * れる/られる/すぎる/せる/させる → shift by the suffix's own accent;
/// * anything else ends the adjustment for this phrase.
pub(crate) fn modify_acc_after_chaining(njd_features: &mut [NjdFeature]) {
    if njd_features.is_empty() {
        return;
    }
    const SUFFIXES_TO_MODIFY_ACC: &[&str] = &["れる", "られる", "すぎる", "せる", "させる"];
    // Index of the current phrase head; its `acc` is the one rewritten.
    let mut head_index = 0;
    // Remaining morae until the nucleus within the current phrase.
    let mut acc = 0;
    // True once the token containing the nucleus has been passed.
    let mut is_after_nuc = false;
    // Morae accumulated from the phrase head up to (and including) the
    // tokens already visited — used as the new nucleus position.
    let mut phase_len = 0;
    for i in 0..njd_features.len() {
        if njd_features[i].chain_flag == 0 || njd_features[i].chain_flag == -1 {
            // New accent phrase: reset the state machine.
            is_after_nuc = false;
            head_index = i;
            acc = njd_features[head_index].acc;
            phase_len = 0;
        }
        if acc == 0 {
            // Heiban phrase (or adjustment already terminated): nothing to move.
            continue;
        }
        let mora_size = njd_features[i].mora_size;
        if is_after_nuc {
            let njd = &njd_features[i];
            if njd.ctype == "特殊・マス" {
                njd_features[head_index].acc = if njd.cform != "未然形" {
                    phase_len + 1
                } else {
                    phase_len + 2
                };
            } else if njd.ctype == "特殊・ナイ" {
                njd_features[head_index].acc = phase_len;
            } else if SUFFIXES_TO_MODIFY_ACC.contains(&njd.orig.as_str()) {
                njd_features[head_index].acc = phase_len + njd.acc;
            } else {
                // Unknown continuation: stop adjusting this phrase.
                is_after_nuc = false;
                acc = 0;
            }
            phase_len += mora_size;
        } else {
            phase_len += mora_size;
            if acc <= mora_size {
                // The nucleus falls inside this token.
                is_after_nuc = true;
            } else {
                acc -= mora_size;
            }
        }
    }
}
/// Rewrites `feat` in place as a generic noun (名詞/一般), wildcarding the
/// remaining part-of-speech and conjugation fields.
#[inline(always)]
fn set_to_noun(feat: &mut NjdFeature) {
    feat.pos = String::from("名詞");
    feat.pos_group1 = String::from("一般");
    for slot in [
        &mut feat.pos_group2,
        &mut feat.pos_group3,
        &mut feat.ctype,
        &mut feat.cform,
    ] {
        *slot = String::from("*");
    }
}
/// Detects the length (in morae) of a repeated trailing unit in `read`,
/// returning the shortest period, or `None` if the tail does not repeat.
///
/// The comparison is done in seion (unvoiced) form so that rendaku pairs
/// still match, e.g. ヒトビト → ヒトヒト matches with period 2.
fn detect_odori_unit(read: &str) -> Option<usize> {
    let seion_read: String = read
        .chars()
        .map(|ch| match is_dakuon(ch) {
            true => TO_SEION_CHAR.get(&ch).copied().unwrap_or(ch),
            false => ch,
        })
        .collect();
    let moras = split_kana_mora(&seion_read);
    let n = moras.len();
    if n < 2 {
        return None;
    }
    // Shortest `len` such that the last two windows of `len` morae agree.
    (1..=n / 2).find(|&len| moras[n - 2 * len..n - len] == moras[n - len..n])
}
/// Resolves Japanese iteration marks in the NJD feature stream.
///
/// Two families of marks are handled:
/// * `々` (kanji repetition): readings are reconstructed from the preceding
///   kanji token(s); a lone `々` following a multi-kanji word may trigger a
///   frontend reanalysis of the repeated character(s).
/// * `ゝ/ゞ/ヽ/ヾ` (kana repetition): the previous mora is copied, voiced or
///   unvoiced depending on the mark (see `apply_odoriji_logic`).
///
/// May insert or remove tokens via `splice`, hence `&mut Vec`.
///
/// # Errors
/// Propagates any error from re-running the OpenJTalk frontend.
pub(crate) fn process_odori_features(
    njd_features: &mut Vec<NjdFeature>,
    open_jtalk: &mut OpenJTalk,
) -> Result<(), HaqumeiError> {
    let mut i = 0;
    while i < njd_features.len() {
        let orig = &njd_features[i].orig;
        if is_dounojiten(orig) {
            // --- Single 々 after a multi-kanji word ------------------------
            // The tokenizer may have split a compound (e.g. 「代々木」 into
            // 「代」「々」「木」); reanalyze "<last kanji of prev>[ + next
            // single kanji]" to recover the intended reading.
            let mut reanalysis_result = None;
            if i > 0 {
                let prev = &njd_features[i - 1];
                if count_dounojiten(orig) == 1 && is_kanji_feature(prev) {
                    let prev_chars: Vec<char> = prev.orig.chars().collect();
                    if prev_chars.len() > 1 {
                        let last_char = *prev_chars.last().unwrap();
                        if is_kanji(last_char) {
                            let next_token_opt = if i + 1 < njd_features.len() {
                                Some(&njd_features[i + 1])
                            } else {
                                None
                            };
                            // Include the following token only when it is a
                            // single kanji (the 「木」 of 「代々木」).
                            let (target_text, consumed_next) = match next_token_opt {
                                Some(next) if is_single_kanji_feature(next) => {
                                    (format!("{}{}", last_char, next.orig), true)
                                }
                                _ => (last_char.to_string(), false),
                            };
                            reanalysis_result = Some((target_text, consumed_next));
                        }
                    }
                }
            }
            if let Some((text, consumed_next)) = reanalysis_result {
                let mut analyzed = open_jtalk.run_frontend(&text)?;
                if let Some(first) = analyzed.get_mut(0) {
                    // Keep the replacement chained to the previous token.
                    first.chain_flag = 1;
                }
                let range_end = if consumed_next { i + 2 } else { i + 1 };
                let analyzed_len = analyzed.len();
                if range_end <= njd_features.len() {
                    njd_features.splice(i..range_end, analyzed);
                    if !consumed_next && analyzed_len > 0 {
                        set_to_noun(&mut njd_features[i]);
                        i += 1;
                    } else {
                        i += analyzed_len;
                    }
                    continue;
                }
            }
            // --- Run of 々 tokens -------------------------------------------
            // Count how many repetition marks follow from here.
            let start = i;
            let mut end = i;
            let mut total_odori = 0;
            while end < njd_features.len() && is_dounojiten(&njd_features[end].orig) {
                total_odori += count_dounojiten(&njd_features[end].orig);
                end += 1;
            }
            // If the previous token itself ends with 々 (an already-resolved
            // repetition), repeat its trailing unit instead of scanning back.
            if i > 0 && njd_features[i - 1].orig.ends_with('々') {
                let prev = &njd_features[i - 1];
                let base_acc = prev.acc;
                if let Some(period) = detect_odori_unit(&prev.read) {
                    let raw_read_moras = split_kana_mora(&prev.read);
                    let raw_pron_moras = split_kana_mora(&prev.pron);
                    if raw_read_moras.len() >= period {
                        let unit_read = raw_read_moras[raw_read_moras.len() - period..].join("");
                        let unit_pron = raw_pron_moras[raw_pron_moras.len() - period..].join("");
                        // Per-mora weight of prev, scaled to the unit length.
                        let unit_mora =
                            (prev.mora_size / raw_read_moras.len() as i32) * period as i32;
                        let current_feat = &mut njd_features[i];
                        // BUGFIX: this line previously read
                        // `count_dounojiten(¤t_feat.orig)` — mojibake of
                        // `&current_feat.orig` (`&curren` decoded as the
                        // HTML entity `¤`), which does not compile.
                        let count = count_dounojiten(&current_feat.orig);
                        current_feat.read = unit_read.repeat(count);
                        current_feat.pron = unit_pron.repeat(count);
                        current_feat.mora_size = unit_mora * count as i32;
                        current_feat.acc = base_acc;
                        current_feat.chain_flag = 1;
                        if current_feat.pos == "記号" {
                            set_to_noun(current_feat);
                        }
                        i += 1;
                        continue;
                    }
                }
            }
            // Otherwise scan backwards, collecting contiguous kanji tokens
            // until we have at least `needed_chars` characters to repeat.
            let mut normal_indices = Vec::new();
            let mut j = start;
            let mut collected_chars = 0;
            // Cap at 8 repeated characters as a sanity limit.
            let needed_chars = total_odori.min(8);
            while j > 0 {
                j -= 1;
                let target = &njd_features[j];
                if matches!(target.pos.as_str(), "記号" | "フィラー" | "感動詞") {
                    break;
                }
                if is_kanji_feature(target) {
                    normal_indices.push(j);
                    collected_chars += target.orig.chars().count();
                    if collected_chars >= needed_chars {
                        break;
                    }
                } else {
                    break;
                }
            }
            normal_indices.reverse();
            if normal_indices.is_empty() {
                // No kanji source to repeat; leave the marks untouched.
                i = end;
                continue;
            }
            let base_acc = njd_features[normal_indices[0]].acc;
            // A single one-kanji source (e.g. 「人々」) is repeated per mark;
            // a multi-token source is copied once onto each mark token.
            let is_single_kanji = normal_indices.len() == 1
                && njd_features[normal_indices[0]].orig.chars().count() == 1;
            let (base_read, base_pron, base_mora_size) = if is_single_kanji {
                let f = &njd_features[normal_indices[0]];
                (f.read.clone(), f.pron.clone(), f.mora_size)
            } else {
                let mut r = String::new();
                let mut p = String::new();
                let mut m = 0;
                for &idx in &normal_indices {
                    r.push_str(&njd_features[idx].read);
                    p.push_str(&njd_features[idx].pron);
                    m += njd_features[idx].mora_size;
                }
                (r, p, m)
            };
            for feat in njd_features.iter_mut().take(end).skip(start) {
                let current_odori = count_dounojiten(&feat.orig);
                if is_single_kanji {
                    feat.read = base_read.repeat(current_odori);
                    feat.pron = base_pron.repeat(current_odori);
                    feat.mora_size = base_mora_size * current_odori as i32;
                } else {
                    feat.read = base_read.clone();
                    feat.pron = base_pron.clone();
                    feat.mora_size = base_mora_size;
                }
                feat.acc = base_acc;
                feat.chain_flag = 1;
                if feat.pos == "記号" {
                    set_to_noun(feat);
                }
            }
            i = end;
        } else if is_ichinojiten(orig) {
            // --- Kana repetition marks ゝ/ゞ/ヽ/ヾ --------------------------
            if i > 0 {
                if njd_features[i - 1].pos != "記号" {
                    // Find the nearest preceding non-symbol token that has
                    // morae to copy from.
                    let mut prev_index = None;
                    let mut k = i;
                    while k > 0 {
                        k -= 1;
                        if njd_features[k].pos != "記号" && njd_features[k].mora_size > 0 {
                            prev_index = Some(k);
                            break;
                        }
                    }
                    if let Some(pidx) = prev_index {
                        let prev_read = njd_features[pidx].read.clone();
                        let prev_pron = njd_features[pidx].pron.clone();
                        let prev_mora_size = njd_features[pidx].mora_size;
                        let curr = &mut njd_features[i];
                        apply_odoriji_logic(curr, &prev_read, &prev_pron, prev_mora_size);
                    }
                }
            }
            i += 1;
        } else {
            i += 1;
        }
    }
    Ok(())
}
/// Returns `true` when `orig` is a non-empty run consisting solely of the
/// kanji iteration mark `々`.
#[inline(always)]
fn is_dounojiten(orig: &str) -> bool {
    let mut marks = orig.chars();
    matches!(marks.next(), Some('々')) && marks.all(|c| c == '々')
}
/// Returns `true` when `orig` is a non-empty run consisting solely of kana
/// iteration marks (ゝ ゞ ヽ ヾ).
#[inline(always)]
fn is_ichinojiten(orig: &str) -> bool {
    const KANA_ITERATION_MARKS: [char; 4] = ['ゝ', 'ゞ', 'ヽ', 'ヾ'];
    !orig.is_empty() && orig.chars().all(|c| KANA_ITERATION_MARKS.contains(&c))
}
/// Counts occurrences of the kanji iteration mark `々` in `orig`.
#[inline(always)]
fn count_dounojiten(orig: &str) -> usize {
    orig.matches('々').count()
}
/// Fills in `read`/`pron`/`mora_size` for a kana iteration mark token
/// (`ゝ/ゞ/ヽ/ヾ`) by repeating the last mora of the preceding token.
///
/// `ゞ`/`ヾ` force the voiced (dakuon) form of the copied mora; `ゝ`/`ヽ`
/// take the unvoiced (seion) form, except that morae written with small kana
/// (e.g. キャ) are copied verbatim. Also re-tags a 記号 token as a noun.
fn apply_odoriji_logic(
    odori_feature: &mut NjdFeature,
    prev_read: &str,
    prev_pron: &str,
    prev_mora_size: i32,
) {
    let prev_read_mora = split_kana_mora(prev_read);
    // Strip the devoicing mark ’ before splitting the pronunciation.
    let prev_pron_source = if prev_pron.contains('’') {
        Cow::Owned(prev_pron.replace('’', ""))
    } else {
        Cow::Borrowed(prev_pron)
    };
    // Empty pronunciation → fall back to the reading.
    let prev_pron_source = if prev_pron_source.is_empty() {
        prev_read
    } else {
        prev_pron_source.as_ref()
    };
    let prev_pron_mora = split_kana_mora(prev_pron_source);
    if prev_read_mora.is_empty() {
        return;
    }
    // Average per-mora weight of the previous token, assigned to the mark.
    let mora_val = prev_mora_size / prev_read_mora.len() as i32;
    let target_read = prev_read_mora.last().unwrap().clone();
    let target_pron = prev_pron_mora.last().unwrap_or(&target_read).clone();
    // The first iteration mark in `orig` decides voicing. (Was a char loop
    // with a useless `.peekable()` adaptor that was never peeked.)
    let is_forced_voiced = odori_feature
        .orig
        .chars()
        .find_map(|c| match c {
            'ゞ' | 'ヾ' => Some(true),
            'ゝ' | 'ヽ' => Some(false),
            _ => None,
        })
        .unwrap_or(false);
    // Morae containing small kana must not be devoiced mechanically.
    let is_single_grapheme_mora = !target_read.chars().any(is_small_kana);
    if is_forced_voiced {
        odori_feature.read = TO_DAKUON
            .get(&target_read)
            .copied()
            .unwrap_or(&target_read)
            .to_string();
        odori_feature.pron = TO_DAKUON
            .get(&target_pron)
            .copied()
            .unwrap_or(&target_pron)
            .to_string();
    } else if is_single_grapheme_mora {
        odori_feature.read = TO_SEION
            .get(&target_read)
            .copied()
            .unwrap_or(&target_read)
            .to_string();
        odori_feature.pron = TO_SEION
            .get(&target_pron)
            .copied()
            .unwrap_or(&target_pron)
            .to_string();
    } else {
        odori_feature.read = target_read.to_string();
        odori_feature.pron = target_pron.to_string();
    }
    odori_feature.mora_size = mora_val;
    if odori_feature.pos == "記号" {
        set_to_noun(odori_feature);
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// 「参り」(acc = 1, 3 morae) chained with 「ます」(ctype 特殊・マス,
    /// cform 基本形): the nucleus falls inside 参り (1 <= 3), so the マス
    /// rule fires on the following token and the phrase-head accent becomes
    /// phase_len(3) + 1 = 4.
    #[test]
    fn test_modify_acc_after_chaining_mut() {
        let mut features = [
            NjdFeature {
                string: "参り".to_string(),
                pos: "動詞".to_string(),
                pos_group1: "自立".to_string(),
                pos_group2: "*".to_string(),
                pos_group3: "*".to_string(),
                ctype: "五段・ラ行".to_string(),
                cform: "連用形".to_string(),
                orig: "参る".to_string(),
                read: "マイリ".to_string(),
                pron: "マイリ".to_string(),
                acc: 1,
                mora_size: 3,
                chain_rule: "*".to_string(),
                // -1: starts a new accent phrase (this token is the head).
                chain_flag: -1,
            },
            NjdFeature {
                string: "ます".to_string(),
                pos: "助動詞".to_string(),
                pos_group1: "*".to_string(),
                pos_group2: "*".to_string(),
                pos_group3: "*".to_string(),
                ctype: "特殊・マス".to_string(),
                cform: "基本形".to_string(),
                orig: "ます".to_string(),
                read: "マス".to_string(),
                pron: "マス’".to_string(),
                acc: 1,
                mora_size: 2,
                chain_rule: "動詞%F2@1/助詞%F2@1".to_string(),
                // 1: chained to the previous token's accent phrase.
                chain_flag: 1,
            },
        ];
        modify_acc_after_chaining(&mut features);
        let 参り = features.first().unwrap();
        assert_eq!(参り.acc, 4);
    }
}