#![allow(clippy::struct_excessive_bools)]
#![allow(clippy::items_after_statements)]
#![allow(clippy::cast_precision_loss)]
/// Normalization form requested from a [`Normalizer`].
///
/// The variant names follow the Unicode normalization forms, but
/// `Normalizer::normalize` only distinguishes [`NormForm::None`] from the
/// rest: every other variant enables the same built-in mapping step.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NormForm {
    /// Skip the normalization step entirely.
    None,
    /// Enables the normalization step (handled like every non-`None` form).
    Nfc,
    /// Enables the normalization step (handled like every non-`None` form).
    Nfd,
    /// Enables the normalization step; this is the default form.
    Nfkc,
    /// Enables the normalization step (handled like every non-`None` form).
    Nfkd,
}

impl Default for NormForm {
    /// Returns [`NormForm::Nfkc`].
    fn default() -> Self {
        NormForm::Nfkc
    }
}
/// Display width class of a character: fullwidth or halfwidth.
///
/// NOTE(review): nothing in the visible part of this file uses this type;
/// presumably it is part of the public API for callers elsewhere — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CharWidth {
/// Fullwidth form.
Full,
/// Halfwidth form.
Half,
}
/// Configurable text normalizer.
///
/// Each field switches one step of `Normalizer::normalize` on or off.
#[derive(Debug, Clone)]
pub struct Normalizer {
/// Normalization form; any value other than `NormForm::None` enables the
/// built-in normalization step.
pub form: NormForm,
/// Convert fullwidth digits (U+FF10–U+FF19) to ASCII digits.
pub halfwidth_numbers: bool,
/// Convert fullwidth Latin letters (U+FF21–U+FF3A, U+FF41–U+FF5A) to ASCII.
pub halfwidth_ascii: bool,
/// Convert halfwidth katakana to fullwidth katakana.
pub fullwidth_katakana: bool,
/// Replace every run of whitespace with a single ASCII space.
pub collapse_whitespace: bool,
/// Strip leading and trailing whitespace.
pub trim: bool,
/// Lowercase the result (applied as the last step).
pub lowercase: bool,
/// Remove zero-width characters.
pub remove_zero_width: bool,
}
impl Default for Normalizer {
fn default() -> Self {
Self {
form: NormForm::Nfkc,
halfwidth_numbers: true,
halfwidth_ascii: true,
fullwidth_katakana: true,
collapse_whitespace: true,
trim: true,
lowercase: false,
remove_zero_width: true,
}
}
}
impl Normalizer {
    /// Creates a normalizer with the default configuration.
    ///
    /// Equivalent to [`Normalizer::default`].
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a normalizer with every step disabled, so that
    /// [`normalize`](Self::normalize) returns a copy of its input.
    #[must_use]
    pub fn minimal() -> Self {
        Self {
            form: NormForm::None,
            halfwidth_numbers: false,
            halfwidth_ascii: false,
            fullwidth_katakana: false,
            collapse_whitespace: false,
            trim: false,
            lowercase: false,
            remove_zero_width: false,
        }
    }

    /// Returns `self` with the normalization form replaced by `form`.
    #[must_use]
    pub fn with_form(mut self, form: NormForm) -> Self {
        self.form = form;
        self
    }

    /// Returns `self` with lowercasing switched on or off.
    #[must_use]
    pub fn with_lowercase(mut self, lowercase: bool) -> Self {
        self.lowercase = lowercase;
        self
    }

    /// Runs every enabled step over `text` and returns the result.
    ///
    /// Steps run in this fixed order: normalization form, width conversion,
    /// zero-width removal, whitespace collapsing, trimming, lowercasing.
    ///
    /// Only `form != NormForm::None` is checked; the specific form is not
    /// consulted, so every non-`None` form produces the same output.
    pub fn normalize(&self, text: &str) -> String {
        // Start from the first step's output instead of copying the input
        // and then immediately replacing that copy.
        let mut result = if self.form == NormForm::None {
            text.to_owned()
        } else {
            Self::apply_unicode_normalization(text)
        };
        if self.halfwidth_numbers || self.halfwidth_ascii || self.fullwidth_katakana {
            result = self.convert_widths(&result);
        }
        if self.remove_zero_width {
            result = Self::remove_zero_width_chars(&result);
        }
        if self.collapse_whitespace {
            result = Self::collapse_ws(&result);
        }
        if self.trim {
            let trimmed = result.trim();
            // Re-allocate only when trimming actually removed something.
            if trimmed.len() != result.len() {
                result = trimmed.to_owned();
            }
        }
        if self.lowercase {
            result = result.to_lowercase();
        }
        result
    }

    /// Applies the built-in compatibility mapping.
    ///
    /// * Fullwidth ASCII variants U+FF01–U+FF5E become their ASCII
    ///   counterparts (the two ranges are a constant 0xFEE0 apart).
    /// * The ideographic space U+3000 becomes an ASCII space.
    /// * The wave dash U+301C becomes the fullwidth tilde U+FF5E.
    ///
    /// NOTE(review): a U+FF5E already present in the input is mapped to
    /// ASCII `~` by the first rule, while U+301C ends up as U+FF5E; confirm
    /// that the two are meant to normalize to different characters.
    fn apply_unicode_normalization(text: &str) -> String {
        text.chars()
            .map(|c| match c {
                '\u{FF01}'..='\u{FF5E}' => char::from_u32(c as u32 - 0xFEE0).unwrap_or(c),
                '\u{3000}' => ' ',
                '\u{301C}' => '\u{FF5E}',
                _ => c,
            })
            .collect()
    }

    /// Converts character widths according to the enabled width flags.
    ///
    /// When `fullwidth_katakana` is set, a halfwidth kana immediately
    /// followed by a halfwidth voiced (U+FF9E) or semi-voiced (U+FF9F) sound
    /// mark is emitted as the single precomposed fullwidth kana (for example
    /// U+FF76 U+FF9E becomes `ガ`). A mark that cannot combine with the
    /// preceding kana is converted on its own, as before.
    fn convert_widths(&self, text: &str) -> String {
        let mut result = String::with_capacity(text.len());
        let mut chars = text.chars().peekable();
        while let Some(c) = chars.next() {
            let converted = self.convert_char_width(c);
            // `converted != c` means a width conversion happened; only a
            // kana produced that way can have a composition entry.
            if self.fullwidth_katakana && converted != c {
                let composed = chars
                    .peek()
                    .and_then(|&mark| Self::compose_sound_mark(converted, mark));
                if let Some(kana) = composed {
                    // The mark has been folded into `kana`; consume it.
                    chars.next();
                    result.push(kana);
                    continue;
                }
            }
            result.push(converted);
        }
        result
    }

    /// Combines a fullwidth katakana `base` with a halfwidth sound `mark`.
    ///
    /// Returns the precomposed kana, or `None` when `mark` is not U+FF9E or
    /// U+FF9F, or when `base` has no precomposed form for that mark.
    fn compose_sound_mark(base: char, mark: char) -> Option<char> {
        let voiced = mark == '\u{FF9E}';
        let semi_voiced = mark == '\u{FF9F}';
        // In the Katakana block the voiced form directly follows its base
        // and, for the ハ row, the semi-voiced form follows the voiced one.
        // ウ, ワ and ヲ have their voiced forms at the end of the block.
        let offset = match base {
            'カ' | 'キ' | 'ク' | 'ケ' | 'コ' | 'サ' | 'シ' | 'ス' | 'セ' | 'ソ' | 'タ' | 'チ'
            | 'ツ' | 'テ' | 'ト'
                if voiced =>
            {
                1
            }
            'ハ' | 'ヒ' | 'フ' | 'ヘ' | 'ホ' if voiced => 1,
            'ハ' | 'ヒ' | 'フ' | 'ヘ' | 'ホ' if semi_voiced => 2,
            'ウ' if voiced => 0x4E,
            'ワ' | 'ヲ' if voiced => 8,
            _ => return None,
        };
        char::from_u32(base as u32 + offset)
    }

    /// Converts one character, trying the enabled conversions in order:
    /// fullwidth digit, fullwidth Latin letter, halfwidth kana. Returns `c`
    /// unchanged when no enabled conversion applies.
    fn convert_char_width(&self, c: char) -> char {
        let digit = if self.halfwidth_numbers {
            fullwidth_digit_to_half(c)
        } else {
            None
        };
        digit
            .or_else(|| {
                if self.halfwidth_ascii {
                    fullwidth_alpha_to_half(c)
                } else {
                    None
                }
            })
            .or_else(|| {
                if self.fullwidth_katakana {
                    halfwidth_kana_to_full(c)
                } else {
                    None
                }
            })
            .unwrap_or(c)
    }

    /// Returns `text` without the characters matched by `is_zero_width`.
    fn remove_zero_width_chars(text: &str) -> String {
        text.chars().filter(|&c| !is_zero_width(c)).collect()
    }

    /// Replaces every run of whitespace (per `char::is_whitespace`, so
    /// including line breaks) with a single ASCII space.
    fn collapse_ws(text: &str) -> String {
        let mut result = String::with_capacity(text.len());
        let mut in_run = false;
        for c in text.chars() {
            let is_space = c.is_whitespace();
            if !is_space {
                result.push(c);
            } else if !in_run {
                // First whitespace of a run: emit the single replacement.
                result.push(' ');
            }
            in_run = is_space;
        }
        result
    }
}
/// Reports whether `c` is one of the zero-width code points this module
/// strips: U+200B, U+200C, U+200D, U+2060 or U+FEFF.
fn is_zero_width(c: char) -> bool {
    const ZERO_WIDTH: [char; 5] = ['\u{200B}', '\u{200C}', '\u{200D}', '\u{2060}', '\u{FEFF}'];
    ZERO_WIDTH.contains(&c)
}
/// Maps a fullwidth digit (U+FF10–U+FF19) to the matching ASCII digit.
///
/// Returns `None` for every other character, including ASCII digits.
fn fullwidth_digit_to_half(c: char) -> Option<char> {
    match c {
        // The fullwidth digits are contiguous, so the offset from U+FF10 is
        // the digit's numeric value.
        '\u{FF10}'..='\u{FF19}' => char::from_digit(c as u32 - 0xFF10, 10),
        _ => None,
    }
}
/// Maps a fullwidth Latin letter to the matching ASCII letter.
///
/// Handles U+FF21–U+FF3A (uppercase) and U+FF41–U+FF5A (lowercase); every
/// other character yields `None`.
fn fullwidth_alpha_to_half(c: char) -> Option<char> {
    // Pick the start of the fullwidth run and the ASCII letter it maps to.
    let (first_full, first_ascii) = match c {
        '\u{FF21}'..='\u{FF3A}' => ('\u{FF21}', 'A'),
        '\u{FF41}'..='\u{FF5A}' => ('\u{FF41}', 'a'),
        _ => return None,
    };
    char::from_u32(c as u32 - first_full as u32 + first_ascii as u32)
}
/// Maps a halfwidth katakana character (U+FF65–U+FF9F) to its fullwidth
/// counterpart.
///
/// The halfwidth middle dot U+FF65 maps to U+30FB. The halfwidth voiced and
/// semi-voiced sound marks (U+FF9E, U+FF9F) map to the spacing marks U+309B
/// and U+309C. Every character outside the range yields `None`.
fn halfwidth_kana_to_full(c: char) -> Option<char> {
    /// First code point covered by `FULLWIDTH`.
    const FIRST_HALFWIDTH: u32 = 0xFF65;
    /// Fullwidth counterparts of U+FF65..=U+FF9F, indexed by the offset
    /// from `FIRST_HALFWIDTH`.
    const FULLWIDTH: [char; 59] = [
        // U+FF65..=U+FF69: middle dot, wo, small a, small i, small u
        '\u{30FB}', '\u{30F2}', '\u{30A1}', '\u{30A3}', '\u{30A5}',
        // U+FF6A..=U+FF6E: small e, small o, small ya, small yu, small yo
        '\u{30A7}', '\u{30A9}', '\u{30E3}', '\u{30E5}', '\u{30E7}',
        // U+FF6F..=U+FF73: small tu, prolonged sound mark, a, i, u
        '\u{30C3}', '\u{30FC}', '\u{30A2}', '\u{30A4}', '\u{30A6}',
        // U+FF74..=U+FF78: e, o, ka, ki, ku
        '\u{30A8}', '\u{30AA}', '\u{30AB}', '\u{30AD}', '\u{30AF}',
        // U+FF79..=U+FF7D: ke, ko, sa, si, su
        '\u{30B1}', '\u{30B3}', '\u{30B5}', '\u{30B7}', '\u{30B9}',
        // U+FF7E..=U+FF82: se, so, ta, ti, tu
        '\u{30BB}', '\u{30BD}', '\u{30BF}', '\u{30C1}', '\u{30C4}',
        // U+FF83..=U+FF87: te, to, na, ni, nu
        '\u{30C6}', '\u{30C8}', '\u{30CA}', '\u{30CB}', '\u{30CC}',
        // U+FF88..=U+FF8C: ne, no, ha, hi, hu
        '\u{30CD}', '\u{30CE}', '\u{30CF}', '\u{30D2}', '\u{30D5}',
        // U+FF8D..=U+FF91: he, ho, ma, mi, mu
        '\u{30D8}', '\u{30DB}', '\u{30DE}', '\u{30DF}', '\u{30E0}',
        // U+FF92..=U+FF96: me, mo, ya, yu, yo
        '\u{30E1}', '\u{30E2}', '\u{30E4}', '\u{30E6}', '\u{30E8}',
        // U+FF97..=U+FF9B: ra, ri, ru, re, ro
        '\u{30E9}', '\u{30EA}', '\u{30EB}', '\u{30EC}', '\u{30ED}',
        // U+FF9C..=U+FF9F: wa, n, voiced sound mark, semi-voiced sound mark
        '\u{30EF}', '\u{30F3}', '\u{309B}', '\u{309C}',
    ];

    // Code points below the range fail the subtraction and code points
    // above it fail the lookup, so no separate range check is needed.
    let index = (c as u32).checked_sub(FIRST_HALFWIDTH)? as usize;
    FULLWIDTH.get(index).copied()
}
/// Normalizes `text` with the default [`Normalizer`] configuration.
///
/// Despite the name, this runs the whole default pipeline (width
/// conversion, zero-width removal, whitespace collapsing and trimming),
/// not only a Unicode normalization form.
pub fn to_nfkc(text: &str) -> String {
    let normalizer = Normalizer::default();
    normalizer.normalize(text)
}
/// Returns `text` with every whitespace character (as defined by
/// `char::is_whitespace`) removed.
pub fn remove_whitespace(text: &str) -> String {
    let mut kept = String::with_capacity(text.len());
    for c in text.chars() {
        if !c.is_whitespace() {
            kept.push(c);
        }
    }
    kept
}
/// Unifies bracket and quotation variants, character by character.
///
/// * `『` and U+FF62 become `「`; `』` and U+FF63 become `」`.
/// * `〔` becomes `【`; `〕` becomes `】`.
/// * Every other character is passed through unchanged.
pub fn normalize_quotes(text: &str) -> String {
    text.chars()
        .map(|c| match c {
            '「' | '『' | '\u{FF62}' => '「',
            '」' | '』' | '\u{FF63}' => '」',
            // NOTE(review): both alternatives of the two parenthesis arms
            // are the same character as written, so these arms are identity
            // mappings; presumably one side was meant to be the fullwidth
            // form — confirm.
            '(' | '(' => '(',
            ')' | ')' => ')',
            '【' | '〔' => '【',
            '】' | '〕' => '】',
            _ => c,
        })
        .collect()
}
/// Unifies punctuation variants, character by character.
///
/// As written, `.` becomes `。` and `,` becomes `、`; every other character,
/// including `。`, `、`, `!`, `?`, `:` and `;`, is returned unchanged.
pub fn normalize_punctuation(text: &str) -> String {
    text.chars()
        .map(|c| match c {
            // NOTE(review): several alternatives below repeat the same
            // character, which leaves the last four arms as identity
            // mappings; presumably the duplicates were meant to be the
            // halfwidth/fullwidth variants — confirm.
            '。' | '。' | '.' => '。',
            '、' | '、' | ',' => '、',
            '!' | '!' => '!',
            '?' | '?' => '?',
            ':' | ':' => ':',
            ';' | ';' => ';',
            _ => c,
        })
        .collect()
}
/// Returns `true` when every character of `text` lies in the Hiragana block
/// (U+3040–U+309F) or is whitespace.
///
/// An empty or whitespace-only string therefore also yields `true`.
pub fn is_hiragana_only(text: &str) -> bool {
    for c in text.chars() {
        let hiragana = matches!(c, '\u{3040}'..='\u{309F}');
        if !hiragana && !c.is_whitespace() {
            return false;
        }
    }
    true
}
/// Returns `true` when every character of `text` is katakana or whitespace.
///
/// Katakana here means the Katakana block (U+30A0–U+30FF) plus the
/// halfwidth range U+FF65–U+FF9F. An empty or whitespace-only string also
/// yields `true`.
pub fn is_katakana_only(text: &str) -> bool {
    text.chars().all(|c| match c {
        '\u{30A0}'..='\u{30FF}' | '\u{FF65}'..='\u{FF9F}' => true,
        other => other.is_whitespace(),
    })
}
/// Returns `true` when `text` contains at least one character from
/// U+4E00–U+9FFF, U+3400–U+4DBF or U+F900–U+FAFF.
pub fn contains_kanji(text: &str) -> bool {
    for c in text.chars() {
        let kanji = matches!(
            c,
            '\u{3400}'..='\u{4DBF}' | '\u{4E00}'..='\u{9FFF}' | '\u{F900}'..='\u{FAFF}'
        );
        if kanji {
            return true;
        }
    }
    false
}
/// Per-category character tallies for a piece of text.
///
/// Every character is counted in exactly one category; see
/// [`CharTypeCounts::from_text`] for the precedence between categories.
#[derive(Debug, Clone, Default)]
pub struct CharTypeCounts {
    /// Characters in the Hiragana block (U+3040–U+309F).
    pub hiragana: usize,
    /// Characters in the Katakana block (U+30A0–U+30FF) or the halfwidth
    /// katakana range (U+FF65–U+FF9F).
    pub katakana: usize,
    /// Characters in U+4E00–U+9FFF, U+3400–U+4DBF or U+F900–U+FAFF.
    pub kanji: usize,
    /// ASCII letters.
    pub ascii_letter: usize,
    /// ASCII digits.
    pub digit: usize,
    /// ASCII punctuation and the punctuation listed in `is_cjk_punctuation`.
    pub punctuation: usize,
    /// Whitespace, as defined by `char::is_whitespace`.
    pub whitespace: usize,
    /// Everything that matched no other category.
    pub other: usize,
}

impl CharTypeCounts {
    /// Classifies every character of `text` and returns the tallies.
    ///
    /// Categories are tested in this order, first match wins: hiragana,
    /// katakana, kanji, ASCII letter, ASCII digit, punctuation, whitespace,
    /// other. Kanji uses the same three ranges as `contains_kanji`, so the
    /// two functions agree on what a kanji is.
    pub fn from_text(text: &str) -> Self {
        let mut counts = Self::default();
        for c in text.chars() {
            // Pick the counter for this character, then bump it once.
            let slot = match c {
                '\u{3040}'..='\u{309F}' => &mut counts.hiragana,
                '\u{30A0}'..='\u{30FF}' | '\u{FF65}'..='\u{FF9F}' => &mut counts.katakana,
                '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}' => {
                    &mut counts.kanji
                }
                _ if c.is_ascii_alphabetic() => &mut counts.ascii_letter,
                _ if c.is_ascii_digit() => &mut counts.digit,
                _ if c.is_ascii_punctuation() || is_cjk_punctuation(c) => &mut counts.punctuation,
                _ if c.is_whitespace() => &mut counts.whitespace,
                _ => &mut counts.other,
            };
            *slot += 1;
        }
        counts
    }

    /// Returns the total number of characters counted.
    pub fn total(&self) -> usize {
        [
            self.hiragana,
            self.katakana,
            self.kanji,
            self.ascii_letter,
            self.digit,
            self.punctuation,
            self.whitespace,
            self.other,
        ]
        .iter()
        .sum()
    }

    /// Returns the share of hiragana, katakana and kanji among all counted
    /// characters, in `0.0..=1.0`; `0.0` when nothing was counted.
    pub fn japanese_ratio(&self) -> f64 {
        let total = self.total();
        if total == 0 {
            // Avoid dividing by zero for empty input.
            return 0.0;
        }
        let japanese = self.hiragana + self.katakana + self.kanji;
        japanese as f64 / total as f64
    }
}

/// Reports whether `c` is one of the CJK punctuation marks recognised by
/// [`CharTypeCounts::from_text`].
///
/// The fullwidth forms of `! ? ( ) : ;` and the fullwidth tilde are
/// accepted alongside the ASCII forms that were already listed.
fn is_cjk_punctuation(c: char) -> bool {
    matches!(
        c,
        '。' | '、'
            | '「'
            | '」'
            | '『'
            | '』'
            | '【'
            | '】'
            | '・'
            | '…'
            | '‥'
            | '゛'
            | '゜'
            // Wave dash and fullwidth tilde.
            | '\u{301C}'
            | '\u{FF5E}'
            // ASCII forms, kept so existing results do not change.
            | '!'
            | '?'
            | '('
            | ')'
            | ':'
            | ';'
            // Fullwidth forms of the six marks above.
            | '\u{FF01}'
            | '\u{FF1F}'
            | '\u{FF08}'
            | '\u{FF09}'
            | '\u{FF1A}'
            | '\u{FF1B}'
    )
}
// Unit tests for the normalizer and the character-classification helpers.
#[cfg(test)]
mod tests {
use super::*;
// Default configuration: leading and trailing whitespace is trimmed.
// NOTE(review): the input is already ASCII as written, so this does not
// exercise width conversion; presumably fullwidth input was intended —
// confirm.
#[test]
fn test_normalizer_default() {
let norm = Normalizer::new();
let result = norm.normalize(" ABCD 123 ");
assert_eq!(result, "ABCD 123");
}
// With every step disabled the input must come back unchanged.
#[test]
fn test_normalizer_minimal() {
let norm = Normalizer::minimal();
let input = " ABC ";
let result = norm.normalize(input);
assert_eq!(result, input); }
// First and last fullwidth digit, plus a non-digit.
#[test]
fn test_fullwidth_digit_to_half() {
assert_eq!(fullwidth_digit_to_half('\u{FF10}'), Some('0'));
assert_eq!(fullwidth_digit_to_half('\u{FF19}'), Some('9'));
assert_eq!(fullwidth_digit_to_half('a'), None);
}
// Both ends of the uppercase and lowercase fullwidth letter ranges.
#[test]
fn test_fullwidth_alpha_to_half() {
assert_eq!(fullwidth_alpha_to_half('\u{FF21}'), Some('A'));
assert_eq!(fullwidth_alpha_to_half('\u{FF3A}'), Some('Z'));
assert_eq!(fullwidth_alpha_to_half('\u{FF41}'), Some('a'));
assert_eq!(fullwidth_alpha_to_half('\u{FF5A}'), Some('z'));
}
// Halfwidth "a" and "n" map to their fullwidth forms; ASCII is rejected.
#[test]
fn test_halfwidth_kana_to_full() {
assert_eq!(halfwidth_kana_to_full('\u{FF71}'), Some('\u{30A2}')); assert_eq!(halfwidth_kana_to_full('\u{FF9D}'), Some('\u{30F3}')); assert_eq!(halfwidth_kana_to_full('A'), None);
}
// U+200B and U+FEFF are zero-width; an ordinary letter is not.
#[test]
fn test_is_zero_width() {
assert!(is_zero_width('\u{200B}'));
assert!(is_zero_width('\u{FEFF}'));
assert!(!is_zero_width('a'));
}
// The default pipeline strips a zero-width character inside a word.
#[test]
fn test_remove_zero_width() {
let norm = Normalizer::new();
let input = "te\u{200B}st";
let result = norm.normalize(input);
assert_eq!(result, "test");
}
// NOTE(review): the input contains only single spaces as written, so no
// whitespace run is actually collapsed here; presumably longer runs were
// intended — confirm.
#[test]
fn test_collapse_whitespace() {
let norm = Normalizer::new();
let result = norm.normalize("a b c");
assert_eq!(result, "a b c");
}
// Lowercasing is opt-in through the builder.
#[test]
fn test_lowercase() {
let norm = Normalizer::new().with_lowercase(true);
let result = norm.normalize("HELLO World");
assert_eq!(result, "hello world");
}
// NOTE(review): input and expected output are identical ASCII as written;
// presumably fullwidth input was intended — confirm.
#[test]
fn test_to_nfkc() {
let result = to_nfkc("ABCD123");
assert_eq!(result, "ABCD123");
}
// All spaces are removed, not merely collapsed.
#[test]
fn test_remove_whitespace() {
let result = remove_whitespace("a b c");
assert_eq!(result, "abc");
}
// Double corner brackets become single corner brackets.
#[test]
fn test_normalize_quotes() {
let result = normalize_quotes("『テスト』");
assert_eq!(result, "「テスト」");
}
// Text that already uses the ideographic full stop and comma is unchanged.
#[test]
fn test_normalize_punctuation() {
let result = normalize_punctuation("テスト。これはテスト、です");
assert_eq!(result, "テスト。これはテスト、です");
}
// Hiragana passes; katakana and kanji do not.
#[test]
fn test_is_hiragana_only() {
assert!(is_hiragana_only("ひらがな"));
assert!(!is_hiragana_only("カタカナ"));
assert!(!is_hiragana_only("漢字"));
}
// Katakana passes; hiragana does not.
#[test]
fn test_is_katakana_only() {
assert!(is_katakana_only("カタカナ"));
assert!(!is_katakana_only("ひらがな"));
}
// Kanji is detected in mixed text and absent from pure kana.
#[test]
fn test_contains_kanji() {
assert!(contains_kanji("漢字テスト"));
assert!(!contains_kanji("ひらがなカタカナ"));
}
// Mixed text: 2 kanji, 5 hiragana, 4 katakana, 3 ASCII letters, 3 digits.
#[test]
fn test_char_type_counts() {
let counts = CharTypeCounts::from_text("漢字とひらがなカタカナABC123");
assert_eq!(counts.kanji, 2);
assert_eq!(counts.hiragana, 5); assert_eq!(counts.katakana, 4);
assert_eq!(counts.ascii_letter, 3);
assert_eq!(counts.digit, 3);
}
// Five hiragana and five digits give a Japanese ratio of one half.
#[test]
fn test_char_type_japanese_ratio() {
let counts = CharTypeCounts::from_text("あいうえお12345");
let ratio = counts.japanese_ratio();
assert!((ratio - 0.5).abs() < 0.01);
}
// Empty input: zero total, and the ratio is defined as exactly 0.0.
#[test]
#[allow(clippy::float_cmp)]
fn test_char_type_counts_empty() {
let counts = CharTypeCounts::from_text("");
assert_eq!(counts.total(), 0);
assert_eq!(counts.japanese_ratio(), 0.0);
}
// The ideographic space U+3000 becomes an ASCII space.
#[test]
fn test_ideographic_space() {
let norm = Normalizer::new();
let result = norm.normalize("テスト\u{3000}テスト");
assert_eq!(result, "テスト テスト");
}
}