use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use crate::char::{Char, CharPool};
use crate::transliterator::{Transliterator, TransliteratorFactory};
use crate::{TransliterationError, TransliteratorFactoryError};
/// Bit-packed classification of a single code point.
///
/// The three high bits (see `CLASS_MASK`) hold the script class; the five low bits are
/// independent attribute flags that can be combined with `|`.
#[derive(Debug, Clone, Copy, PartialEq)]
struct CharType(u8);

impl CharType {
    /// Selects the script-class bits of the packed value.
    const CLASS_MASK: u8 = 0xe0;

    // Script classes.
    const OTHER: CharType = CharType(0);
    const HIRAGANA: CharType = CharType(1 << 5);
    const KATAKANA: CharType = CharType(2 << 5);
    const ALPHABET: CharType = CharType(3 << 5);
    const DIGIT: CharType = CharType(4 << 5);
    const EITHER: CharType = CharType(5 << 5);

    // Attribute flags.
    const HALFWIDTH: CharType = CharType(0x01);
    const VOWEL_ENDED: CharType = CharType(0x02);
    const HATSUON: CharType = CharType(0x04);
    const SOKUON: CharType = CharType(0x08);
    const PROLONGED_SOUND_MARK: CharType = CharType(0x10);

    // Frequently used combinations.
    const HALFWIDTH_DIGIT: CharType = CharType(Self::DIGIT.0 | Self::HALFWIDTH.0);
    const FULLWIDTH_DIGIT: CharType = CharType(Self::DIGIT.0);
    const HALFWIDTH_ALPHABET: CharType = CharType(Self::ALPHABET.0 | Self::HALFWIDTH.0);
    const FULLWIDTH_ALPHABET: CharType = CharType(Self::ALPHABET.0);
    const ORDINARY_HIRAGANA: CharType = CharType(Self::HIRAGANA.0 | Self::VOWEL_ENDED.0);
    const ORDINARY_KATAKANA: CharType = CharType(Self::KATAKANA.0 | Self::VOWEL_ENDED.0);
    const ORDINARY_HALFWIDTH_KATAKANA: CharType =
        CharType(Self::KATAKANA.0 | Self::VOWEL_ENDED.0 | Self::HALFWIDTH.0);

    /// Returns `true` when the script class is `ALPHABET` or `DIGIT`.
    fn is_alnum(&self) -> bool {
        let class = self.0 & Self::CLASS_MASK;
        [Self::ALPHABET.0, Self::DIGIT.0].contains(&class)
    }

    /// Returns `true` when the `HALFWIDTH` flag is set.
    fn is_halfwidth(&self) -> bool {
        (self.0 & Self::HALFWIDTH.0) == Self::HALFWIDTH.0
    }

    /// Returns `true` when the script class is `HIRAGANA`, `KATAKANA` or `EITHER`.
    fn is_kana(&self) -> bool {
        let class = self.0 & Self::CLASS_MASK;
        [Self::HIRAGANA.0, Self::KATAKANA.0, Self::EITHER.0].contains(&class)
    }

    /// Looks up the kana code points that carry a role flag (sokuon, hatsuon or prolonged
    /// sound mark) instead of `VOWEL_ENDED`. Returns `None` for every other code point.
    fn get_special_char_type(codepoint: u32) -> Option<Self> {
        let (script, role) = match codepoint {
            0x3063 => (Self::HIRAGANA, Self::SOKUON),
            0x3093 => (Self::HIRAGANA, Self::HATSUON),
            0x30c3 => (Self::KATAKANA, Self::SOKUON),
            0x30f3 => (Self::KATAKANA, Self::HATSUON),
            0x30fc => (Self::EITHER, Self::PROLONGED_SOUND_MARK),
            0xff6f => (Self::KATAKANA | Self::HALFWIDTH, Self::SOKUON),
            0xff70 => (Self::KATAKANA | Self::HALFWIDTH, Self::PROLONGED_SOUND_MARK),
            0xff9d => (Self::KATAKANA | Self::HALFWIDTH, Self::HATSUON),
            _ => return None,
        };
        Some(script | role)
    }

    /// Classifies `codepoint`.
    ///
    /// Special kana (see `get_special_char_type`) win over the plain kana ranges that contain
    /// them; anything outside the recognised ranges is `OTHER`.
    fn from_codepoint(codepoint: u32) -> Self {
        Self::get_special_char_type(codepoint).unwrap_or_else(|| match codepoint {
            0x30..=0x39 => Self::HALFWIDTH_DIGIT,
            0xff10..=0xff19 => Self::FULLWIDTH_DIGIT,
            0x41..=0x5a | 0x61..=0x7a => Self::HALFWIDTH_ALPHABET,
            0xff21..=0xff3a | 0xff41..=0xff5a => Self::FULLWIDTH_ALPHABET,
            0x3041..=0x309c | 0x309f => Self::ORDINARY_HIRAGANA,
            0x30a1..=0x30fa | 0x30fd..=0x30ff => Self::ORDINARY_KATAKANA,
            0xff66..=0xff6f | 0xff71..=0xff9f => Self::ORDINARY_HALFWIDTH_KATAKANA,
            _ => Self::OTHER,
        })
    }
}

/// Combines two packed values bit by bit.
impl std::ops::BitOr for CharType {
    type Output = CharType;

    fn bitor(self, rhs: CharType) -> CharType {
        let CharType(left) = self;
        let CharType(right) = rhs;
        CharType(left | right)
    }
}

/// In-place variant of the bitwise combination.
impl std::ops::BitOrAssign for CharType {
    fn bitor_assign(&mut self, rhs: CharType) {
        *self = *self | rhs;
    }
}
/// Options controlling how [`ProlongedSoundMarksTransliterator`] rewrites hyphen-like characters.
///
/// Every flag is `false` under `Default`, and `#[serde(default)]` makes a flag that is missing
/// from deserialized input fall back to `false` as well.
#[derive(Debug, Default, Clone, PartialEq, Deserialize, Serialize)]
pub struct ProlongedSoundMarksTransliteratorOptions {
/// When set, a hyphen-like char whose `source` is populated is passed through untouched, and
/// a buffered run containing such a char is not turned into hyphens.
/// NOTE(review): a populated `source` presumably means "produced by an earlier
/// transliteration step" — confirm against `Char`.
#[serde(default)]
pub skip_already_transliterated_chars: bool,
/// When set, a hyphen-like char following a hatsuon (U+3093, U+30F3, U+FF9D) is also replaced
/// by a prolonged sound mark.
#[serde(default)]
pub allow_prolonged_hatsuon: bool,
/// When set, a hyphen-like char following a sokuon (U+3063, U+30C3, U+FF6F) is also replaced
/// by a prolonged sound mark.
#[serde(default)]
pub allow_prolonged_sokuon: bool,
/// When set, hyphen-like chars following an alphanumeric char are replaced by a hyphen:
/// U+002D after a halfwidth char, U+FF0D after a fullwidth one.
#[serde(default)]
pub replace_prolonged_marks_following_alnums: bool,
/// When set, hyphen-like chars whose neighbouring non-mark chars are both non-kana (the end of
/// input counts as non-kana) are replaced by a hyphen: U+FF0D only when neither neighbour is
/// halfwidth, U+002D otherwise.
#[serde(default)]
pub replace_prolonged_marks_between_non_kanas: bool,
}
/// Transliterator that rewrites hyphen-like characters into prolonged sound marks or hyphens
/// depending on the characters around them.
#[derive(Debug, Clone)]
pub struct ProlongedSoundMarksTransliterator {
// Options this instance was constructed with.
options: ProlongedSoundMarksTransliteratorOptions,
// Union of the `CharType` flag bits after which a hyphen-like char becomes a prolonged sound
// mark; computed once in `new` from the `allow_prolonged_*` options.
prolongables: CharType,
}
impl ProlongedSoundMarksTransliterator {
    /// Creates a transliterator for `options`.
    ///
    /// The set of flags that make a preceding char "prolongable" always contains `VOWEL_ENDED`
    /// and `PROLONGED_SOUND_MARK`; `HATSUON` and `SOKUON` are added when the matching
    /// `allow_prolonged_*` option is enabled.
    pub fn new(options: ProlongedSoundMarksTransliteratorOptions) -> Self {
        let optional_flags = [
            (options.allow_prolonged_hatsuon, CharType::HATSUON),
            (options.allow_prolonged_sokuon, CharType::SOKUON),
        ];
        let prolongables = optional_flags
            .iter()
            .filter(|(enabled, _)| *enabled)
            .fold(
                CharType::VOWEL_ENDED | CharType::PROLONGED_SOUND_MARK,
                |acc, (_, flag)| acc | *flag,
            );
        Self {
            options,
            prolongables,
        }
    }

    /// Returns `true` when `c` consists of exactly one of the hyphen-like code points this
    /// transliterator treats as a prolonged-sound-mark candidate.
    fn is_prolonged_mark(c: &str) -> bool {
        let mut scalars = c.chars();
        let (Some(only), None) = (scalars.next(), scalars.next()) else {
            return false;
        };
        matches!(
            only,
            '\u{002D}'
                | '\u{2010}'
                | '\u{2014}'
                | '\u{2015}'
                | '\u{2212}'
                | '\u{FF0D}'
                | '\u{FF70}'
                | '\u{30FC}'
        )
    }
}
impl ProlongedSoundMarksTransliterator {
    /// Chooses the hyphen that replaces a buffered run of mark candidates, if any.
    ///
    /// `prev_type` is the type of the last non-mark char seen before the run and `next_type`
    /// the type of the char that ended it (`CharType::OTHER` for the sentinel).
    ///
    /// Returns `None` when the run must be emitted unchanged: either no enabled option applies,
    /// or `skip_already_transliterated_chars` is set and the run contains a char with a
    /// populated `source`. Otherwise returns U+002D or U+FF0D.
    fn hyphen_for_run(
        &self,
        prev_type: Option<CharType>,
        next_type: CharType,
        run_has_transliterated_chars: bool,
    ) -> Option<&'static str> {
        let options = &self.options;
        let after_alnum = options.replace_prolonged_marks_following_alnums
            && prev_type.is_none_or(|t| t.is_alnum());
        let between_non_kanas = options.replace_prolonged_marks_between_non_kanas
            && prev_type.is_none_or(|t| !t.is_kana())
            && !next_type.is_kana();

        if !(after_alnum || between_non_kanas) {
            return None;
        }
        if options.skip_already_transliterated_chars && run_has_transliterated_chars {
            return None;
        }

        let halfwidth = if between_non_kanas {
            // Between non-kanas the fullwidth hyphen is used only when neither side is
            // halfwidth.
            prev_type.is_none_or(|t| t.is_halfwidth()) || next_type.is_halfwidth()
        } else {
            // After an alphanumeric the width follows the preceding char.
            prev_type.map_or(next_type.is_halfwidth(), |t| t.is_halfwidth())
        };
        Some(if halfwidth { "\u{002D}" } else { "\u{FF0D}" })
    }

    /// Moves every buffered mark candidate from `run` to `result`, leaving `run` empty.
    ///
    /// With `Some(replacement)` each candidate is replaced by that text; with `None` it is
    /// re-emitted as is. `offset` is advanced by the byte length of whatever was emitted.
    fn flush_run<'a, 'b>(
        pool: &mut CharPool<'a, 'b>,
        run: &mut Vec<&'a Char<'a, 'b>>,
        replacement: Option<&'static str>,
        offset: &mut usize,
        result: &mut Vec<&'a Char<'a, 'b>>,
    ) {
        for mark in run.drain(..) {
            let (emitted, emitted_len) = match replacement {
                Some(hyphen) => (
                    pool.new_char_from(Cow::Borrowed(hyphen), *offset, mark),
                    hyphen.len(),
                ),
                None => (pool.new_with_offset(mark, *offset), mark.c.len()),
            };
            *offset += emitted_len;
            result.push(emitted);
        }
    }
}

impl Transliterator for ProlongedSoundMarksTransliterator {
    /// Rewrites hyphen-like characters according to their neighbours.
    ///
    /// * A mark candidate whose closest preceding non-mark char is prolongable (see
    ///   `prolongables`) becomes U+30FC, or U+FF70 when that char is halfwidth.
    /// * Otherwise, when a hyphen-replacement option could still apply, the candidate and the
    ///   candidates following it are buffered until the next non-mark char or the sentinel
    ///   (a char with empty text) shows what follows the run; the run is then either turned
    ///   into hyphens or emitted unchanged.
    /// * Every other char is passed through.
    ///
    /// Emitted chars carry a running byte offset into the output text. This implementation
    /// never returns `Err`.
    fn transliterate<'a, 'b>(
        &self,
        pool: &mut CharPool<'a, 'b>,
        chars: &[&'a Char<'a, 'b>],
    ) -> Result<Vec<&'a Char<'a, 'b>>, TransliterationError> {
        let mut result = Vec::with_capacity(chars.len());
        // Mark candidates whose fate depends on the char that follows them.
        let mut lookahead_buf: Vec<&'a Char<'a, 'b>> = Vec::new();
        let mut last_non_prolonged_type: Option<CharType> = None;
        // Whether the buffered run contains a char with a populated `source`.
        let mut processed_chars_in_lookahead = false;
        let mut offset = 0;

        for &current_char in chars {
            let is_sentinel = current_char.c.is_empty();

            if !is_sentinel && Self::is_prolonged_mark(current_char.c.as_ref()) {
                if !lookahead_buf.is_empty() {
                    // Extend the pending run; the decision is taken when it ends.
                    processed_chars_in_lookahead |= current_char.source.is_some();
                    lookahead_buf.push(current_char);
                    continue;
                }

                let eligible = !self.options.skip_already_transliterated_chars
                    || current_char.source.is_none();
                if eligible {
                    if let Some(last_type) = last_non_prolonged_type {
                        if (self.prolongables.0 & last_type.0) != 0 {
                            let replacement = if last_type.is_halfwidth() {
                                "\u{FF70}"
                            } else {
                                "\u{30FC}"
                            };
                            let replaced = pool.new_char_from(
                                Cow::Borrowed(replacement),
                                offset,
                                current_char,
                            );
                            offset += replacement.len();
                            result.push(replaced);
                            continue;
                        }

                        let may_become_hyphen = (self
                            .options
                            .replace_prolonged_marks_following_alnums
                            && last_type.is_alnum())
                            || (self.options.replace_prolonged_marks_between_non_kanas
                                && !last_type.is_kana());
                        if may_become_hyphen {
                            lookahead_buf.push(current_char);
                            continue;
                        }
                    }
                }
                // Nothing applies: fall through and pass the mark on unchanged.
            } else {
                // The sentinel has no first char and is therefore classified as `OTHER`.
                let current_type = current_char
                    .c
                    .chars()
                    .next()
                    .map_or(CharType::OTHER, |ch| CharType::from_codepoint(u32::from(ch)));

                if !lookahead_buf.is_empty() {
                    let replacement = self.hyphen_for_run(
                        last_non_prolonged_type,
                        current_type,
                        processed_chars_in_lookahead,
                    );
                    Self::flush_run(
                        pool,
                        &mut lookahead_buf,
                        replacement,
                        &mut offset,
                        &mut result,
                    );
                    processed_chars_in_lookahead = false;
                }
                if !is_sentinel {
                    last_non_prolonged_type = Some(current_type);
                }
            }

            // Re-offset every passed-through char (the sentinel included) so offsets stay
            // consistent after replacements whose byte length differs from the original.
            result.push(pool.new_with_offset(current_char, offset));
            offset += current_char.c.len();
        }
        Ok(result)
    }
}
impl TransliteratorFactory for ProlongedSoundMarksTransliteratorOptions {
    /// Builds a boxed [`ProlongedSoundMarksTransliterator`] from a copy of these options.
    ///
    /// This implementation always returns `Ok`.
    fn new_transliterator(&self) -> Result<Box<dyn Transliterator>, TransliteratorFactoryError> {
        let transliterator = ProlongedSoundMarksTransliterator::new(self.clone());
        let boxed: Box<dyn Transliterator> = Box::new(transliterator);
        Ok(boxed)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::char::{from_chars, CharPool};

    // Width-sensitive characters (fullwidth and halfwidth forms) are written as `\u{...}`
    // escapes so that Unicode normalization of this file cannot silently turn them into their
    // ordinary counterparts and invalidate the expectations.

    /// Options with only `replace_prolonged_marks_following_alnums` enabled.
    fn following_alnums() -> ProlongedSoundMarksTransliteratorOptions {
        ProlongedSoundMarksTransliteratorOptions {
            replace_prolonged_marks_following_alnums: true,
            ..Default::default()
        }
    }

    /// Options with only `replace_prolonged_marks_between_non_kanas` enabled.
    fn between_non_kanas() -> ProlongedSoundMarksTransliteratorOptions {
        ProlongedSoundMarksTransliteratorOptions {
            replace_prolonged_marks_between_non_kanas: true,
            ..Default::default()
        }
    }

    /// Transliterates `input` with `options` and checks the text of every output char.
    /// `expected` must include the trailing sentinel as `""`.
    fn assert_chars(
        options: ProlongedSoundMarksTransliteratorOptions,
        input: &str,
        expected: &[&str],
    ) {
        let transliterator = ProlongedSoundMarksTransliterator::new(options);
        let mut pool = CharPool::new();
        let input_chars = pool.build_char_array(input);
        let result = transliterator
            .transliterate(&mut pool, &input_chars)
            .unwrap();
        assert_eq!(
            result.len(),
            expected.len(),
            "length mismatch for input {input:?}"
        );
        for (index, (actual, expected)) in result.iter().zip(expected).enumerate() {
            assert_eq!(
                actual.c, *expected,
                "mismatch at index {index} for input {input:?}"
            );
        }
    }

    /// Transliterates `input` with `options` and checks the joined output text.
    fn assert_text(
        options: ProlongedSoundMarksTransliteratorOptions,
        input: &str,
        expected: &str,
    ) {
        let transliterator = ProlongedSoundMarksTransliterator::new(options);
        let mut pool = CharPool::new();
        let input_chars = pool.build_char_array(input);
        let result = transliterator
            .transliterate(&mut pool, &input_chars)
            .unwrap();
        assert_eq!(
            from_chars(result.iter().cloned()),
            expected,
            "Failed for input {input:?}"
        );
    }

    #[test]
    fn test_basic_prolonged_sound_replacement() {
        assert_chars(Default::default(), "かー", &["か", "\u{30FC}", ""]);
    }

    #[test]
    fn test_katakana_prolonged_sound_replacement() {
        assert_chars(Default::default(), "カー", &["カ", "\u{30FC}", ""]);
    }

    #[test]
    fn test_halfwidth_katakana_prolonged_sound() {
        // Halfwidth KA followed by the halfwidth prolonged sound mark.
        assert_chars(
            Default::default(),
            "\u{FF76}\u{FF70}",
            &["\u{FF76}", "\u{FF70}", ""],
        );
    }

    #[test]
    fn test_hyphen_replacement_between_alphabet() {
        assert_chars(following_alnums(), "a-b", &["a", "\u{002D}", "b", ""]);
    }

    #[test]
    fn test_em_dash_replacement_between_fullwidth() {
        // Fullwidth "a", em dash, fullwidth "b".
        assert_chars(
            following_alnums(),
            "\u{FF41}\u{2014}\u{FF42}",
            &["\u{FF41}", "\u{FF0D}", "\u{FF42}", ""],
        );
    }

    #[test]
    fn test_allow_prolonged_hatsuon() {
        let options = ProlongedSoundMarksTransliteratorOptions {
            allow_prolonged_hatsuon: true,
            ..Default::default()
        };
        assert_chars(options, "んー", &["ん", "\u{30FC}", ""]);
    }

    #[test]
    fn test_allow_prolonged_sokuon() {
        let options = ProlongedSoundMarksTransliteratorOptions {
            allow_prolonged_sokuon: true,
            ..Default::default()
        };
        assert_chars(options, "っー", &["っ", "\u{30FC}", ""]);
    }

    #[test]
    fn test_no_replacement_for_non_japanese() {
        assert_chars(Default::default(), "x-y", &["x", "-", "y", ""]);
    }

    #[test]
    fn test_multiple_dashes_replacement() {
        assert_chars(
            following_alnums(),
            "a--b",
            &["a", "\u{002D}", "\u{002D}", "b", ""],
        );
    }

    #[test]
    fn test_skip_already_transliterated() {
        let options = ProlongedSoundMarksTransliteratorOptions {
            skip_already_transliterated_chars: true,
            ..Default::default()
        };
        let transliterator = ProlongedSoundMarksTransliterator::new(options);
        let mut pool = CharPool::new();
        let input_chars = pool.build_char_array("か");
        let source_char = input_chars[0];
        // A hyphen that carries a `source`, which the option tells the transliterator to skip.
        let dash_char = pool.new_char_from(Cow::Borrowed("-"), 1, source_char);
        let chars_with_source = vec![source_char, dash_char, input_chars[1]];
        let result = transliterator
            .transliterate(&mut pool, &chars_with_source)
            .unwrap();
        assert_eq!(result.len(), 3);
        assert_eq!(result[0].c, "か");
        assert_eq!(result[1].c, "-");
        assert!(result[2].c.is_empty());
    }

    #[test]
    fn test_mixed_script_text() {
        assert_chars(
            Default::default(),
            "コーヒー",
            &["コ", "\u{30FC}", "ヒ", "\u{30FC}", ""],
        );
    }

    #[test]
    fn test_no_replacement_without_preceding_char() {
        assert_chars(Default::default(), "-か", &["-", "か", ""]);
    }

    #[test]
    fn test_character_type_detection() {
        assert_eq!(
            CharType::from_codepoint(u32::from('a')),
            CharType::HALFWIDTH_ALPHABET
        );
        assert_eq!(
            CharType::from_codepoint(u32::from('\u{FF41}')),
            CharType::FULLWIDTH_ALPHABET
        );
        assert_eq!(
            CharType::from_codepoint(u32::from('1')),
            CharType::HALFWIDTH_DIGIT
        );
        assert_eq!(
            CharType::from_codepoint(u32::from('\u{FF11}')),
            CharType::FULLWIDTH_DIGIT
        );
        assert_eq!(
            CharType::from_codepoint(u32::from('あ')),
            CharType::ORDINARY_HIRAGANA
        );
        assert_eq!(
            CharType::from_codepoint(u32::from('ア')),
            CharType::ORDINARY_KATAKANA
        );
        assert_eq!(
            CharType::from_codepoint(u32::from('\u{FF71}')),
            CharType::ORDINARY_HALFWIDTH_KATAKANA
        );
        assert_eq!(
            CharType::from_codepoint(0x30fc),
            CharType::EITHER | CharType::PROLONGED_SOUND_MARK
        );
        assert_eq!(
            CharType::from_codepoint(0xff70),
            CharType::KATAKANA | CharType::PROLONGED_SOUND_MARK | CharType::HALFWIDTH
        );
        assert_eq!(
            CharType::from_codepoint(0x3093),
            CharType::HIRAGANA | CharType::HATSUON
        );
        assert_eq!(
            CharType::from_codepoint(0x3063),
            CharType::HIRAGANA | CharType::SOKUON
        );
    }

    #[test]
    fn test_fullwidth_hyphen_replacement_in_katakana() {
        assert_chars(
            Default::default(),
            "イ\u{FF0D}ハト\u{FF0D}ヴォ",
            &["イ", "\u{30FC}", "ハ", "ト", "\u{30FC}", "ヴ", "ォ", ""],
        );
        assert_chars(
            Default::default(),
            "カトラリ\u{FF0D}",
            &["カ", "ト", "ラ", "リ", "\u{30FC}", ""],
        );
    }

    #[test]
    fn test_hyphen_minus_replacement_in_katakana() {
        assert_chars(
            Default::default(),
            "イ\u{002D}ハト\u{002D}ヴォ",
            &["イ", "\u{30FC}", "ハ", "ト", "\u{30FC}", "ヴ", "ォ", ""],
        );
        assert_chars(
            Default::default(),
            "カトラリ\u{002D}",
            &["カ", "ト", "ラ", "リ", "\u{30FC}", ""],
        );
    }

    #[test]
    fn test_prolonged_marks_following_digits_no_replacement() {
        assert_chars(
            Default::default(),
            "1\u{30FC}\u{FF0D}2\u{30FC}3",
            &["1", "\u{30FC}", "\u{FF0D}", "2", "\u{30FC}", "3", ""],
        );
    }

    #[test]
    fn test_prolonged_marks_following_digits_with_replacement() {
        assert_chars(
            following_alnums(),
            "1\u{30FC}\u{FF0D}2\u{30FC}3",
            &["1", "\u{002D}", "\u{002D}", "2", "\u{002D}", "3", ""],
        );
    }

    #[test]
    fn test_prolonged_marks_following_fullwidth_digits_with_replacement() {
        assert_chars(
            following_alnums(),
            "\u{FF11}\u{30FC}\u{FF0D}\u{FF12}\u{30FC}\u{FF13}",
            &[
                "\u{FF11}", "\u{FF0D}", "\u{FF0D}", "\u{FF12}", "\u{FF0D}", "\u{FF13}", "",
            ],
        );
    }

    #[test]
    fn test_sokuon_without_allow_prolonged() {
        assert_chars(
            Default::default(),
            "ウッ\u{FF0D}ウン\u{FF0D}",
            &["ウ", "ッ", "\u{FF0D}", "ウ", "ン", "\u{FF0D}", ""],
        );
    }

    #[test]
    fn test_sokuon_with_allow_prolonged() {
        let options = ProlongedSoundMarksTransliteratorOptions {
            allow_prolonged_sokuon: true,
            ..Default::default()
        };
        assert_chars(
            options,
            "ウッ\u{FF0D}ウン\u{FF0D}",
            &["ウ", "ッ", "\u{30FC}", "ウ", "ン", "\u{FF0D}", ""],
        );
    }

    #[test]
    fn test_hatsuon_with_allow_prolonged() {
        let options = ProlongedSoundMarksTransliteratorOptions {
            allow_prolonged_hatsuon: true,
            ..Default::default()
        };
        assert_chars(
            options,
            "ウッ\u{FF0D}ウン\u{FF0D}",
            &["ウ", "ッ", "\u{FF0D}", "ウ", "ン", "\u{30FC}", ""],
        );
    }

    #[test]
    fn test_both_sokuon_and_hatsuon_allowed() {
        let options = ProlongedSoundMarksTransliteratorOptions {
            allow_prolonged_sokuon: true,
            allow_prolonged_hatsuon: true,
            ..Default::default()
        };
        assert_chars(
            options,
            "ウッ\u{FF0D}ウン\u{FF0D}",
            &["ウ", "ッ", "\u{30FC}", "ウ", "ン", "\u{30FC}", ""],
        );
    }

    #[test]
    fn test_prolonged_sound_marks_factory() {
        let options = ProlongedSoundMarksTransliteratorOptions::default();
        let factory_result = options.new_transliterator();
        assert!(factory_result.is_ok());
        let transliterator = factory_result.unwrap();
        let test_cases = &[
            ("おかあさん", "おかあさん"),
            ("ラーメン", "ラーメン"),
            ("スーパー", "スーパー"),
            ("", ""),
        ];
        for (input, expected) in test_cases {
            let mut pool = CharPool::new();
            let input_chars = pool.build_char_array(input);
            let result = transliterator
                .transliterate(&mut pool, &input_chars)
                .unwrap();
            let result_string = from_chars(result.iter().cloned());
            assert_eq!(result_string, *expected, "Failed for input '{input}'");
        }
    }

    #[test]
    fn test_psm_between_non_kanas() {
        assert_text(between_non_kanas(), "漢\u{30FC}字", "漢\u{FF0D}字");
    }

    #[test]
    fn test_psm_between_halfwidth_alnums_non_kana() {
        assert_text(between_non_kanas(), "1\u{30FC}2", "1\u{002D}2");
    }

    #[test]
    fn test_psm_between_fullwidth_alnums_non_kana() {
        // Fullwidth digits on both sides, so the fullwidth hyphen is expected.
        assert_text(
            between_non_kanas(),
            "\u{FF11}\u{30FC}\u{FF12}",
            "\u{FF11}\u{FF0D}\u{FF12}",
        );
    }

    #[test]
    fn test_psm_after_kana_not_replaced_non_kana_option() {
        assert_text(between_non_kanas(), "カ\u{30FC}漢", "カ\u{30FC}漢");
    }

    #[test]
    fn test_psm_before_kana_not_replaced_non_kana_option() {
        assert_text(between_non_kanas(), "漢\u{30FC}カ", "漢\u{30FC}カ");
    }

    #[test]
    fn test_consecutive_psms_between_non_kanas() {
        assert_text(
            between_non_kanas(),
            "漢\u{30FC}\u{30FC}\u{30FC}字",
            "漢\u{FF0D}\u{FF0D}\u{FF0D}字",
        );
    }

    #[test]
    fn test_consecutive_psms_before_kana_not_replaced() {
        assert_text(
            between_non_kanas(),
            "漢\u{30FC}\u{30FC}\u{30FC}カ",
            "漢\u{30FC}\u{30FC}\u{30FC}カ",
        );
    }

    #[test]
    fn test_trailing_psms_after_fullwidth_non_kana() {
        assert_text(
            between_non_kanas(),
            "漢\u{30FC}\u{30FC}\u{30FC}",
            "漢\u{FF0D}\u{FF0D}\u{FF0D}",
        );
    }

    #[test]
    fn test_trailing_psms_after_halfwidth_non_kana() {
        assert_text(
            between_non_kanas(),
            "1\u{30FC}\u{30FC}\u{30FC}",
            "1\u{002D}\u{002D}\u{002D}",
        );
    }

    #[test]
    fn test_non_kana_only_psm_after_alnum_before_kana() {
        assert_text(between_non_kanas(), "A\u{30FC}カ", "A\u{30FC}カ");
    }

    #[test]
    fn test_both_options_psm_after_alnum_before_kana() {
        let options = ProlongedSoundMarksTransliteratorOptions {
            replace_prolonged_marks_following_alnums: true,
            replace_prolonged_marks_between_non_kanas: true,
            ..Default::default()
        };
        assert_text(options, "A\u{30FC}カ", "A\u{002D}カ");
    }
}