use std::borrow::Cow;
use wana_kana::{ConvertJapanese, IsJapaneseStr, Options};
use super::{Normalizer, NormalizerOption};
use crate::detection::{Language, Script};
use crate::Token;
/// Normalizer that converts Japanese kana in a token's lemma to hiragana
/// (katakana → hiragana; romaji and kanji are left untouched).
pub struct JapaneseNormalizer;
impl Normalizer for JapaneseNormalizer {
    /// Rewrites the token's lemma as hiragana.
    ///
    /// Romaji is passed through unchanged (`pass_romaji: true`), so mixed
    /// strings such as "ダメHi" keep their Latin tail intact. The converted
    /// text is always stored as an owned `Cow`.
    fn normalize<'o>(&self, mut token: Token<'o>, _options: &NormalizerOption) -> Token<'o> {
        let opts = Options {
            pass_romaji: true,
            ..Default::default()
        };
        let hiragana = token.lemma().to_hiragana_with_opt(opts);
        token.lemma = Cow::Owned(hiragana);
        token
    }

    /// A token is normalized only when it is CJ script, is (or may be)
    /// Japanese, and is not already entirely hiragana.
    fn should_normalize(&self, token: &Token) -> bool {
        if token.script != Script::Cj {
            return false;
        }
        let maybe_japanese = matches!(token.language, None | Some(Language::Jpn));
        maybe_japanese && !token.lemma().is_hiragana()
    }
}
#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use crate::normalizer::test::test_normalizer;
    use crate::token::TokenKind;

    // Input fixtures fed to the normalizer pipeline.
    // NOTE: all lemma strings, offsets, and char_map values below are exact
    // expected data for `test_normalizer!` — do not reformat their contents.
    fn tokens() -> Vec<Token<'static>> {
        vec![
            // Already-hiragana lemma: should pass through unchanged.
            Token {
                lemma: Owned("だめ".to_string()),
                char_end: 2,
                byte_end: 6,
                script: Script::Cj,
                language: Some(Language::Jpn),
                ..Default::default()
            },
            // Same lemma but with a pre-existing char_map, which must be preserved.
            Token {
                lemma: Owned("だめ".to_string()),
                char_end: 2,
                byte_end: 6,
                char_map: Some(vec![(3, 3), (3, 3)]),
                script: Script::Cj,
                language: Some(Language::Jpn),
                ..Default::default()
            },
            // Mixed katakana + kanji + hiragana + romaji: only the katakana
            // should be converted; kanji and "Hi" stay as-is.
            Token {
                lemma: Owned("ダメ駄目だめHi".to_string()),
                char_end: 8,
                byte_end: 20,
                script: Script::Cj,
                language: Some(Language::Jpn),
                ..Default::default()
            },
        ]
    }

    // Expected output of JapaneseNormalizer alone (katakana → hiragana,
    // everything else untouched).
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("だめ".to_string()),
                char_end: 2,
                byte_end: 6,
                script: Script::Cj,
                language: Some(Language::Jpn),
                ..Default::default()
            },
            Token {
                lemma: Owned("だめ".to_string()),
                char_end: 2,
                byte_end: 6,
                char_map: Some(vec![(3, 3), (3, 3)]),
                script: Script::Cj,
                language: Some(Language::Jpn),
                ..Default::default()
            },
            // "ダメ" became "だめ"; "駄目", "だめ" and "Hi" are unchanged.
            Token {
                lemma: Owned("だめ駄目だめHi".to_string()),
                char_end: 8,
                byte_end: 20,
                script: Script::Cj,
                language: Some(Language::Jpn),
                ..Default::default()
            },
        ]
    }

    // Expected output after the FULL normalizer pipeline. The voiced kana
    // "だ" appears decomposed as "た" + U+3099 (combining dakuten),
    // presumably produced by an earlier decomposition normalizer in the
    // pipeline — the 6-byte entries in char_map reflect that expansion.
    // Each char_map pair looks like (original byte len, normalized byte len)
    // per character — TODO confirm against the Token char_map convention.
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("た\u{3099}め".to_string()),
                char_end: 2,
                byte_end: 6,
                char_map: Some(vec![(3, 6), (3, 3)]),
                script: Script::Cj,
                language: Some(Language::Jpn),
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("た\u{3099}め".to_string()),
                char_end: 2,
                byte_end: 6,
                char_map: Some(vec![(3, 6), (3, 3)]),
                script: Script::Cj,
                language: Some(Language::Jpn),
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("た\u{3099}め駄目た\u{3099}めHi".to_string()),
                char_end: 8,
                byte_end: 20,
                char_map: Some(vec![
                    (3, 6),
                    (3, 3),
                    (3, 3),
                    (3, 3),
                    (3, 6),
                    (3, 3),
                    (1, 1),
                    (1, 1),
                ]),
                script: Script::Cj,
                language: Some(Language::Jpn),
                kind: TokenKind::Word,
                ..Default::default()
            },
        ]
    }

    // Generates the actual #[test] functions from the fixtures above.
    test_normalizer!(JapaneseNormalizer, tokens(), normalizer_result(), normalized_tokens());
}