1use unicode_normalization::UnicodeNormalization;
30
/// Unicode normalization form that can be applied to the input before the
/// Japanese-specific normalization steps run.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnicodeNormalizationForm {
    /// NFC, applied through [`normalize_nfc`].
    Nfc,
    /// NFD, applied through [`normalize_nfd`].
    Nfd,
    /// NFKC, applied through [`normalize_nfkc`].
    Nfkc,
    /// NFKD, applied through [`normalize_nfkd`].
    Nfkd,
}
39
/// How whitespace is treated at the end of normalization.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WhitespaceMode {
    /// Leave whitespace exactly as it is.
    Preserve,
    /// Run [`normalize_whitespace`]: whitespace runs become a single ASCII
    /// space and leading/trailing whitespace is removed.
    Collapse,
    /// Only strip leading and trailing whitespace (`str::trim`).
    Trim,
}
47
/// Switches selecting which normalization steps [`normalize_with_options`] runs.
///
/// Each flag enables one step of the pipeline; the steps are applied in a
/// fixed order regardless of the order in which the flags were set.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizeOptions {
    /// Unicode normalization form applied first; `None` skips this step.
    pub unicode: Option<UnicodeNormalizationForm>,
    /// Run [`to_half_width`] on the text.
    pub half_width_ascii: bool,
    /// Run [`to_full_width`] on the text.
    pub full_width_ascii: bool,
    /// Run [`to_hiragana`] on the text.
    pub hiragana: bool,
    /// Run [`to_katakana`] on the text.
    pub katakana: bool,
    /// Run [`half_width_katakana_to_full_width`] on the text.
    pub half_width_katakana: bool,
    /// Run [`full_width_katakana_to_half_width`] on the text.
    pub full_width_katakana: bool,
    /// Run [`combine_dakuten`]; ignored when `decompose_dakuten` is set.
    pub combine_dakuten: bool,
    /// Run [`decompose_dakuten`]; takes precedence over `combine_dakuten`.
    pub decompose_dakuten: bool,
    /// Run [`normalize_punctuation`] on the text.
    pub punctuation: bool,
    /// Run [`normalize_brackets_and_quotes`] on the text.
    pub brackets: bool,
    /// Run [`normalize_symbols`] on the text.
    pub symbols: bool,
    /// Run [`old_kanji_to_new`] on the text.
    pub old_kanji: bool,
    /// Run [`remove_variation_selectors`] on the text.
    pub remove_variation_selectors: bool,
    /// Run [`expand_iteration_marks`] on the text.
    pub expand_iteration_marks: bool,
    /// Whitespace handling applied as the final step.
    pub whitespace: WhitespaceMode,
    /// Route the input through the ASCII-token-preserving path, which copies
    /// URL-, e-mail- and number-like ASCII tokens through unchanged.
    pub preserve_ascii_tokens: bool,
}
69
impl Default for NormalizeOptions {
    /// Default configuration: no Unicode normalization form, ASCII and
    /// katakana width folding towards half-width ASCII / full-width katakana,
    /// dakuten composition, punctuation/bracket/symbol/old-kanji
    /// normalization, variation-selector removal, iteration-mark expansion
    /// and collapsed whitespace. Script conversion and ASCII-token
    /// preservation are off.
    fn default() -> Self {
        Self {
            unicode: None,
            half_width_ascii: true,
            full_width_ascii: false,
            hiragana: false,
            katakana: false,
            half_width_katakana: true,
            full_width_katakana: false,
            combine_dakuten: true,
            decompose_dakuten: false,
            punctuation: true,
            brackets: true,
            symbols: true,
            old_kanji: true,
            remove_variation_selectors: true,
            expand_iteration_marks: true,
            whitespace: WhitespaceMode::Collapse,
            preserve_ascii_tokens: false,
        }
    }
}
93
/// Reusable normalizer that carries a [`NormalizeOptions`] configuration.
///
/// Configure it with the builder-style methods and call
/// [`Normalizer::normalize`] for each input.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Normalizer {
    /// Options applied by every call to `normalize`.
    options: NormalizeOptions,
}
99
100impl Normalizer {
101 pub fn new() -> Self {
102 Self::default()
103 }
104
105 pub fn with_options(options: NormalizeOptions) -> Self {
106 Self { options }
107 }
108
109 pub fn unicode(mut self, form: UnicodeNormalizationForm) -> Self {
110 self.options.unicode = Some(form);
111 self
112 }
113
114 pub fn unicode_normalization(mut self, form: Option<UnicodeNormalizationForm>) -> Self {
115 self.options.unicode = form;
116 self
117 }
118
119 pub fn half_width_ascii(mut self, enabled: bool) -> Self {
120 self.options.half_width_ascii = enabled;
121 if enabled {
122 self.options.full_width_ascii = false;
123 }
124 self
125 }
126
127 pub fn full_width_ascii(mut self, enabled: bool) -> Self {
128 self.options.full_width_ascii = enabled;
129 if enabled {
130 self.options.half_width_ascii = false;
131 }
132 self
133 }
134
135 pub fn hiragana(mut self, enabled: bool) -> Self {
136 self.options.hiragana = enabled;
137 if enabled {
138 self.options.katakana = false;
139 }
140 self
141 }
142
143 pub fn katakana(mut self, enabled: bool) -> Self {
144 self.options.katakana = enabled;
145 if enabled {
146 self.options.hiragana = false;
147 }
148 self
149 }
150
151 pub fn half_width_katakana(mut self, enabled: bool) -> Self {
152 self.options.half_width_katakana = enabled;
153 if enabled {
154 self.options.full_width_katakana = false;
155 }
156 self
157 }
158
159 pub fn full_width_katakana(mut self, enabled: bool) -> Self {
160 self.options.full_width_katakana = enabled;
161 if enabled {
162 self.options.half_width_katakana = false;
163 }
164 self
165 }
166
167 pub fn whitespace(mut self, mode: WhitespaceMode) -> Self {
168 self.options.whitespace = mode;
169 self
170 }
171
172 pub fn combine_dakuten(mut self, enabled: bool) -> Self {
173 self.options.combine_dakuten = enabled;
174 if enabled {
175 self.options.decompose_dakuten = false;
176 }
177 self
178 }
179
180 pub fn decompose_dakuten(mut self, enabled: bool) -> Self {
181 self.options.decompose_dakuten = enabled;
182 if enabled {
183 self.options.combine_dakuten = false;
184 }
185 self
186 }
187
188 pub fn punctuation(mut self, enabled: bool) -> Self {
189 self.options.punctuation = enabled;
190 self
191 }
192
193 pub fn brackets(mut self, enabled: bool) -> Self {
194 self.options.brackets = enabled;
195 self
196 }
197
198 pub fn symbols(mut self, enabled: bool) -> Self {
199 self.options.symbols = enabled;
200 self
201 }
202
203 pub fn old_kanji(mut self, enabled: bool) -> Self {
204 self.options.old_kanji = enabled;
205 self
206 }
207
208 pub fn remove_variation_selectors(mut self, enabled: bool) -> Self {
209 self.options.remove_variation_selectors = enabled;
210 self
211 }
212
213 pub fn expand_iteration_marks(mut self, enabled: bool) -> Self {
214 self.options.expand_iteration_marks = enabled;
215 self
216 }
217
218 pub fn preserve_ascii_tokens(mut self, enabled: bool) -> Self {
219 self.options.preserve_ascii_tokens = enabled;
220 self
221 }
222
223 pub fn options(&self) -> &NormalizeOptions {
224 &self.options
225 }
226
227 pub fn normalize(&self, input: &str) -> String {
228 normalize_with_options(input, &self.options)
229 }
230}
231
/// Converts full-width ASCII variants to plain (half-width) ASCII.
///
/// U+FF01..=U+FF5E are mapped onto U+0021..=U+007E and the ideographic space
/// (U+3000) becomes an ASCII space. All other characters are copied
/// unchanged.
pub fn to_half_width(input: &str) -> String {
    // Distance between the full-width forms and printable ASCII.
    const OFFSET: u32 = 0xFF01 - 0x0021;

    input
        .chars()
        .map(|c| match c {
            // Written as an escape so width-folding tools cannot turn this
            // arm into a no-op `' ' => ' '`.
            '\u{3000}' => ' ',
            '\u{FF01}'..='\u{FF5E}' => char::from_u32(c as u32 - OFFSET).unwrap_or(c),
            _ => c,
        })
        .collect()
}
254
/// Converts printable ASCII to its full-width variants.
///
/// U+0021..=U+007E are mapped onto U+FF01..=U+FF5E and the ASCII space
/// becomes the ideographic space (U+3000). All other characters are copied
/// unchanged.
pub fn to_full_width(input: &str) -> String {
    // Distance between printable ASCII and the full-width forms.
    const OFFSET: u32 = 0xFF01 - 0x0021;

    input
        .chars()
        .map(|c| match c {
            // Written as an escape so width-folding tools cannot turn this
            // arm into a no-op `' ' => ' '`.
            ' ' => '\u{3000}',
            '\u{0021}'..='\u{007E}' => char::from_u32(c as u32 + OFFSET).unwrap_or(c),
            _ => c,
        })
        .collect()
}
277
278pub fn to_hiragana(input: &str) -> String {
294 let mut result = String::new();
295
296 for c in input.chars() {
297 match c {
298 '\u{30A1}'..='\u{30F6}' => result.push(shift_char(c, 0x30A1, 0x3041)),
299 'ヷ' => result.push_str("わ\u{3099}"),
300 'ヸ' => result.push_str("ゐ\u{3099}"),
301 'ヹ' => result.push_str("ゑ\u{3099}"),
302 'ヺ' => result.push_str("を\u{3099}"),
303 _ => result.push(c),
304 }
305 }
306
307 result
308}
309
310pub fn to_katakana(input: &str) -> String {
326 let mut result = String::new();
327 let mut chars = input.chars().peekable();
328
329 while let Some(c) = chars.next() {
330 match chars.peek().copied() {
331 Some('\u{3099}') => {
332 if let Some(voiced) = voiced_hiragana_to_katakana(c) {
333 result.push(voiced);
334 chars.next();
335 continue;
336 }
337 }
338 Some('\u{309A}') => {
339 if let Some(semi_voiced) = semi_voiced_hiragana_to_katakana(c) {
340 result.push(semi_voiced);
341 chars.next();
342 continue;
343 }
344 }
345 _ => {}
346 }
347
348 match c {
349 '\u{3041}'..='\u{3096}' => result.push(shift_char(c, 0x3041, 0x30A1)),
350 _ => result.push(c),
351 }
352 }
353
354 result
355}
356
357pub fn full_width_katakana_to_half_width(input: &str) -> String {
370 let mut result = String::new();
371
372 for c in input.chars() {
373 let half = full_width_katakana_char_to_half_width(c);
374 if half.is_empty() {
375 result.push(c);
376 } else {
377 result.push_str(half);
378 }
379 }
380
381 result
382}
383
384pub fn combine_dakuten(input: &str) -> String {
393 let mut result = String::new();
394 let mut chars = input.chars().peekable();
395
396 while let Some(c) = chars.next() {
397 match chars.peek().copied() {
398 Some('\u{3099}') => {
399 if let Some(voiced) = compose_dakuten(c) {
400 result.push(voiced);
401 chars.next();
402 continue;
403 }
404 }
405 Some('\u{309A}') => {
406 if let Some(semi_voiced) = compose_handakuten(c) {
407 result.push(semi_voiced);
408 chars.next();
409 continue;
410 }
411 }
412 _ => {}
413 }
414
415 result.push(c);
416 }
417
418 result
419}
420
421pub fn decompose_dakuten(input: &str) -> String {
430 let mut result = String::new();
431
432 for c in input.chars() {
433 if let Some((base, mark)) = decompose_dakuten_char(c) {
434 result.push(base);
435 result.push(mark);
436 } else {
437 result.push(c);
438 }
439 }
440
441 result
442}
443
/// Returns `input` in Unicode normalization form NFC.
pub fn normalize_nfc(input: &str) -> String {
    input.nfc().collect()
}
448
/// Returns `input` in Unicode normalization form NFD.
pub fn normalize_nfd(input: &str) -> String {
    input.nfd().collect()
}
453
/// Returns `input` in Unicode normalization form NFKC.
pub fn normalize_nfkc(input: &str) -> String {
    input.nfkc().collect()
}
458
/// Returns `input` in Unicode normalization form NFKD.
pub fn normalize_nfkd(input: &str) -> String {
    input.nfkd().collect()
}
463
/// Normalizes commas and full stops to Japanese punctuation.
///
/// ASCII `,`, full-width `,` (U+FF0C) and `、` all become `、`; ASCII `.`,
/// full-width `.` (U+FF0E) and `。` all become `。`. Other characters are
/// copied unchanged.
pub fn normalize_punctuation(input: &str) -> String {
    input
        .chars()
        .map(|c| match c {
            // Full-width forms are escaped so they stay distinct from the
            // ASCII ones even if the file is width-folded.
            ',' | '\u{FF0C}' | '、' => '、',
            '.' | '\u{FF0E}' | '。' => '。',
            _ => c,
        })
        .collect()
}
478
/// Normalizes brackets and quotation marks.
///
/// * Round and square brackets, in both ASCII and full-width form, become
///   ASCII `(` / `)`.
/// * Straight double quotes alternate between `「` and `」`, starting with
///   the opening form; straight single quotes alternate between `『` and `』`.
/// * Directional quotes map directly: `“ 〝` → `「`, `” 〟` → `」`,
///   `‘` → `『`, `’` → `』`.
///
/// Other characters are copied unchanged.
pub fn normalize_brackets_and_quotes(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    // Straight quotes carry no direction, so track whether the next one opens.
    let mut double_quote_open = true;
    let mut single_quote_open = true;

    for c in input.chars() {
        let normalized = match c {
            // Full-width forms are escaped so they stay distinct from the
            // ASCII ones even if the file is width-folded.
            '(' | '[' | '\u{FF08}' | '\u{FF3B}' => '(',
            ')' | ']' | '\u{FF09}' | '\u{FF3D}' => ')',
            '"' => {
                let quote = if double_quote_open { '「' } else { '」' };
                double_quote_open = !double_quote_open;
                quote
            }
            '“' | '〝' => '「',
            '”' | '〟' => '」',
            '\'' => {
                let quote = if single_quote_open { '『' } else { '』' };
                single_quote_open = !single_quote_open;
                quote
            }
            '‘' => '『',
            '’' => '』',
            _ => c,
        };
        result.push(normalized);
    }

    result
}
513
/// Normalizes wave dashes and hyphen-like characters.
///
/// The wave dash `〜`, ASCII `~` and full-width `~` (U+FF5E) become the
/// prolonged sound mark `ー`. The various Unicode hyphens, dashes and minus
/// signs, including the full-width hyphen-minus (U+FF0D), become ASCII `-`.
/// Other characters are copied unchanged.
pub fn normalize_symbols(input: &str) -> String {
    input
        .chars()
        .map(|c| match c {
            // Full-width forms are escaped so they stay distinct from the
            // ASCII ones even if the file is width-folded.
            '〜' | '~' | '\u{FF5E}' => 'ー',
            '‐' | '‑' | '‒' | '–' | '—' | '―' | '−' | '﹣' | '\u{FF0D}' => '-',
            _ => c,
        })
        .collect()
}
528
/// Replaces every character that has an entry in the old-kanji table with
/// its replacement; other characters are copied unchanged.
pub fn old_kanji_to_new(input: &str) -> String {
    map_chars(input, old_kanji_char_to_new)
}
539
540pub fn remove_variation_selectors(input: &str) -> String {
548 input
549 .chars()
550 .filter(|&c| !is_variation_selector(c))
551 .collect()
552}
553
/// Normalizes `input` using [`NormalizeOptions::default`].
pub fn normalize(input: &str) -> String {
    normalize_with_options(input, &NormalizeOptions::default())
}
558
/// Normalizes `input` according to `options`.
///
/// When `options.preserve_ascii_tokens` is set the input is processed by the
/// token-preserving path; otherwise the whole input is normalized as one
/// segment.
pub fn normalize_with_options(input: &str, options: &NormalizeOptions) -> String {
    if options.preserve_ascii_tokens {
        return normalize_preserving_ascii_tokens(input, options);
    }

    normalize_segment(input, options)
}
567
/// Returns `true` if `c` lies in U+3041..=U+3096.
pub fn is_hiragana(c: char) -> bool {
    ('\u{3041}'..='\u{3096}').contains(&c)
}
582
/// Returns `true` if `c` lies in U+30A1..=U+30FA or is the prolonged sound
/// mark `ー`.
pub fn is_katakana(c: char) -> bool {
    c == 'ー' || ('\u{30A1}'..='\u{30FA}').contains(&c)
}
599
/// Returns `true` if `c` lies in U+FF66..=U+FF9F.
pub fn is_half_width_katakana(c: char) -> bool {
    ('\u{FF66}'..='\u{FF9F}').contains(&c)
}
615
/// Returns `true` if `c` lies in U+4E00..=U+9FFF.
pub fn is_kanji(c: char) -> bool {
    ('\u{4E00}'..='\u{9FFF}').contains(&c)
}
631
632pub fn is_full_width(c: char) -> bool {
646 is_hiragana(c)
647 || is_katakana(c)
648 || is_kanji(c)
649 || matches!(
650 c,
651 ' '
652 | '\u{3000}'..='\u{303F}'
653 | '\u{30A0}'..='\u{30FF}'
654 | '\u{FF01}'..='\u{FF5E}'
655 | '\u{FFE0}'..='\u{FFE6}'
656 )
657}
658
/// Per-category character counts produced by [`count_character_types`].
///
/// Every character is counted in exactly one field.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CharacterTypes {
    /// Characters accepted by [`is_hiragana`].
    pub hiragana: usize,
    /// Characters accepted by [`is_katakana`].
    pub katakana: usize,
    /// Characters accepted by [`is_half_width_katakana`].
    pub half_width_katakana: usize,
    /// Characters accepted by [`is_kanji`].
    pub kanji: usize,
    /// ASCII characters.
    pub ascii: usize,
    /// Characters accepted by [`is_full_width`] that fell into none of the
    /// categories above.
    pub full_width: usize,
    /// Everything else.
    pub other: usize,
}
682
/// Per-category share of characters produced by [`character_type_ratios`].
///
/// Each field is the matching [`CharacterTypes`] count divided by the total
/// number of characters; all fields are `0.0` for empty input.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct CharacterTypeRatios {
    /// Share of hiragana characters.
    pub hiragana: f64,
    /// Share of katakana characters.
    pub katakana: f64,
    /// Share of half-width katakana characters.
    pub half_width_katakana: f64,
    /// Share of kanji characters.
    pub kanji: f64,
    /// Share of ASCII characters.
    pub ascii: f64,
    /// Share of other full-width characters.
    pub full_width: f64,
    /// Share of characters in no other category.
    pub other: f64,
}
694
695pub fn count_character_types(input: &str) -> CharacterTypes {
696 let mut counts = CharacterTypes::default();
697
698 for c in input.chars() {
699 if is_hiragana(c) {
700 counts.hiragana += 1;
701 } else if is_katakana(c) {
702 counts.katakana += 1;
703 } else if is_half_width_katakana(c) {
704 counts.half_width_katakana += 1;
705 } else if is_kanji(c) {
706 counts.kanji += 1;
707 } else if c.is_ascii() {
708 counts.ascii += 1;
709 } else if is_full_width(c) {
710 counts.full_width += 1;
711 } else {
712 counts.other += 1;
713 }
714 }
715
716 counts
717}
718
719pub fn character_type_ratios(input: &str) -> CharacterTypeRatios {
721 let counts = count_character_types(input);
722 let total = input.chars().count() as f64;
723
724 if total == 0.0 {
725 return CharacterTypeRatios::default();
726 }
727
728 CharacterTypeRatios {
729 hiragana: counts.hiragana as f64 / total,
730 katakana: counts.katakana as f64 / total,
731 half_width_katakana: counts.half_width_katakana as f64 / total,
732 kanji: counts.kanji as f64 / total,
733 ascii: counts.ascii as f64 / total,
734 full_width: counts.full_width as f64 / total,
735 other: counts.other as f64 / total,
736 }
737}
738
739pub fn is_mostly_japanese(input: &str, threshold: f64) -> bool {
741 let total = input.chars().count();
742 if total == 0 {
743 return false;
744 }
745
746 let counts = count_character_types(input);
747 let japanese = counts.hiragana + counts.katakana + counts.half_width_katakana + counts.kanji;
748 japanese as f64 / total as f64 >= threshold
749}
750
751pub fn has_mixed_scripts(input: &str) -> bool {
753 let counts = count_character_types(input);
754 [
755 counts.hiragana,
756 counts.katakana,
757 counts.half_width_katakana,
758 counts.kanji,
759 counts.ascii,
760 ]
761 .into_iter()
762 .filter(|&count| count > 0)
763 .count()
764 > 1
765}
766
767pub fn extract_japanese(input: &str) -> String {
769 input
770 .chars()
771 .filter(|&c| is_hiragana(c) || is_katakana(c) || is_half_width_katakana(c) || is_kanji(c))
772 .collect()
773}
774
/// Returns only the ASCII characters of `input`, in their original order.
pub fn extract_ascii(input: &str) -> String {
    let mut ascii = String::new();
    for ch in input.chars() {
        if ch.is_ascii() {
            ascii.push(ch);
        }
    }
    ascii
}
779
780pub fn remove_symbols(input: &str) -> String {
782 input
783 .chars()
784 .filter(|&c| !is_symbol_or_punctuation(c))
785 .collect()
786}
787
/// Collapses every run of whitespace into a single ASCII space and removes
/// leading and trailing whitespace.
///
/// "Whitespace" is Unicode whitespace as understood by
/// `str::split_whitespace`, which includes the ideographic space (U+3000),
/// so no separate pre-mapping pass is needed.
pub fn normalize_whitespace(input: &str) -> String {
    input.split_whitespace().collect::<Vec<_>>().join(" ")
}
810
811pub fn half_width_katakana_to_full_width(input: &str) -> String {
825 let mut result = String::new();
826 let mut chars = input.chars().peekable();
827
828 while let Some(c) = chars.next() {
829 let converted = match chars.peek().copied() {
830 Some('゙') => voiced_half_width_katakana(c),
831 Some('゚') => semi_voiced_half_width_katakana(c),
832 _ => None,
833 };
834
835 if let Some(full) = converted {
836 result.push(full);
837 chars.next();
838 } else {
839 result.push(half_width_katakana_char_to_full_width(c));
840 }
841 }
842
843 result
844}
845
/// Replaces the wave dash `〜`, ASCII `~` and full-width `~` (U+FF5E) with
/// the prolonged sound mark `ー`; other characters are copied unchanged.
pub fn normalize_prolonged_sound(input: &str) -> String {
    input
        .chars()
        .map(|c| match c {
            // The full-width tilde is escaped so it stays distinct from the
            // ASCII one even if the file is width-folded.
            '〜' | '~' | '\u{FF5E}' => 'ー',
            _ => c,
        })
        .collect()
}
862
863pub fn expand_iteration_marks(input: &str) -> String {
876 let mut result = String::new();
877
878 for c in input.chars() {
879 match c {
880 'ゝ' => {
882 if let Some(prev) = result.chars().last() {
883 result.push(prev);
884 } else {
885 result.push(c);
886 }
887 }
888 'ゞ' => {
890 if let Some(prev) = result.chars().last() {
891 let voiced = add_dakuten(prev);
892 result.push(voiced);
893 } else {
894 result.push(c);
895 }
896 }
897 'ヽ' => {
899 if let Some(prev) = result.chars().last() {
900 result.push(prev);
901 } else {
902 result.push(c);
903 }
904 }
905 'ヾ' => {
907 if let Some(prev) = result.chars().last() {
908 let voiced = add_dakuten(prev);
909 result.push(voiced);
910 } else {
911 result.push(c);
912 }
913 }
914 _ => result.push(c),
915 }
916 }
917
918 result
919}
920
921fn normalize_segment(input: &str, options: &NormalizeOptions) -> String {
922 let mut text = match options.unicode {
923 Some(UnicodeNormalizationForm::Nfc) => normalize_nfc(input),
924 Some(UnicodeNormalizationForm::Nfd) => normalize_nfd(input),
925 Some(UnicodeNormalizationForm::Nfkc) => normalize_nfkc(input),
926 Some(UnicodeNormalizationForm::Nfkd) => normalize_nfkd(input),
927 None => input.to_string(),
928 };
929
930 if options.remove_variation_selectors {
931 text = remove_variation_selectors(&text);
932 }
933 if options.half_width_katakana {
934 text = half_width_katakana_to_full_width(&text);
935 }
936 if options.hiragana {
937 text = to_hiragana(&text);
938 }
939 if options.katakana {
940 text = to_katakana(&text);
941 }
942 if options.decompose_dakuten {
943 text = decompose_dakuten(&text);
944 } else if options.combine_dakuten {
945 text = combine_dakuten(&text);
946 }
947 if options.full_width_katakana {
948 text = full_width_katakana_to_half_width(&text);
949 }
950 if options.symbols {
951 text = normalize_symbols(&text);
952 }
953 if options.half_width_ascii {
954 text = to_half_width(&text);
955 }
956 if options.full_width_ascii {
957 text = to_full_width(&text);
958 }
959 if options.punctuation {
960 text = normalize_punctuation(&text);
961 }
962 if options.brackets {
963 text = normalize_brackets_and_quotes(&text);
964 }
965 if options.old_kanji {
966 text = old_kanji_to_new(&text);
967 }
968 if options.expand_iteration_marks {
969 text = expand_iteration_marks(&text);
970 }
971
972 match options.whitespace {
973 WhitespaceMode::Preserve => text,
974 WhitespaceMode::Collapse => normalize_whitespace(&text),
975 WhitespaceMode::Trim => text.trim().to_string(),
976 }
977}
978
979fn normalize_preserving_ascii_tokens(input: &str, options: &NormalizeOptions) -> String {
980 let mut result = String::new();
981 let mut ascii_run = String::new();
982 let mut normal_run = String::new();
983
984 for c in input.chars() {
985 if c.is_ascii() && !c.is_ascii_whitespace() {
986 push_normalized_segment(&mut result, &normal_run, options);
987 normal_run.clear();
988 ascii_run.push(c);
989 } else {
990 push_normalized_or_preserved_token(&mut result, &ascii_run, options);
991 ascii_run.clear();
992 normal_run.push(c);
993 }
994 }
995
996 push_normalized_or_preserved_token(&mut result, &ascii_run, options);
997 push_normalized_segment(&mut result, &normal_run, options);
998
999 match options.whitespace {
1000 WhitespaceMode::Preserve => result,
1001 WhitespaceMode::Collapse => normalize_whitespace(&result),
1002 WhitespaceMode::Trim => result.trim().to_string(),
1003 }
1004}
1005
1006fn push_normalized_segment(result: &mut String, segment: &str, options: &NormalizeOptions) {
1007 if segment.is_empty() {
1008 return;
1009 }
1010
1011 let mut segment_options = options.clone();
1012 segment_options.preserve_ascii_tokens = false;
1013 segment_options.whitespace = WhitespaceMode::Preserve;
1014 result.push_str(&normalize_segment(segment, &segment_options));
1015}
1016
1017fn push_normalized_or_preserved_token(
1018 result: &mut String,
1019 token: &str,
1020 options: &NormalizeOptions,
1021) {
1022 if token.is_empty() {
1023 return;
1024 }
1025
1026 if let Some((leading, preserved, trailing)) = split_preserved_ascii_token(token) {
1027 push_normalized_segment(result, leading, options);
1028 result.push_str(preserved);
1029 push_normalized_segment(result, trailing, options);
1030 } else {
1031 push_normalized_segment(result, token, options);
1032 }
1033}
1034
1035fn split_preserved_ascii_token(token: &str) -> Option<(&str, &str, &str)> {
1036 if is_number_like(token) {
1037 return Some(("", token, ""));
1038 }
1039
1040 let leading_start = token
1041 .char_indices()
1042 .find(|&(_, c)| !is_ascii_token_leading_delimiter(c))
1043 .map(|(idx, _)| idx)
1044 .unwrap_or(token.len());
1045 let (leading, rest) = token.split_at(leading_start);
1046
1047 let mut core_end = rest.len();
1048 while core_end > 0 {
1049 let mut chars = rest[..core_end].char_indices();
1050 let Some((idx, c)) = chars.next_back() else {
1051 break;
1052 };
1053
1054 if is_ascii_token_trailing_delimiter(c) {
1055 core_end = idx;
1056 } else {
1057 break;
1058 }
1059 }
1060
1061 let candidate = &rest[..core_end];
1062
1063 if let Some((preserved_start, preserved_end)) = find_preserved_ascii_core(candidate) {
1064 let preserved_start = leading.len() + preserved_start;
1065 let preserved_end = leading.len() + preserved_end;
1066 Some((
1067 &token[..preserved_start],
1068 &token[preserved_start..preserved_end],
1069 &token[preserved_end..],
1070 ))
1071 } else {
1072 None
1073 }
1074}
1075
/// Returns `true` if `token` begins with an `http://` or `https://` scheme.
fn is_url_like(token: &str) -> bool {
    ["http://", "https://"]
        .iter()
        .any(|scheme| token.starts_with(scheme))
}
1079
1080fn find_preserved_ascii_core(token: &str) -> Option<(usize, usize)> {
1081 if is_url_like(token) || is_email_like(token) || is_number_like(token) {
1082 return Some((0, token.len()));
1083 }
1084
1085 let url_start = match (token.find("http://"), token.find("https://")) {
1086 (Some(http), Some(https)) => Some(http.min(https)),
1087 (Some(http), None) => Some(http),
1088 (None, Some(https)) => Some(https),
1089 (None, None) => None,
1090 };
1091 if let Some(start) = url_start {
1092 return Some((start, token.len()));
1093 }
1094
1095 token
1096 .char_indices()
1097 .find_map(|(start, _)| is_email_like(&token[start..]).then_some((start, token.len())))
1098}
1099
/// Returns `true` for characters that may open a token and are not part of
/// its preserved core: ASCII and full-width opening brackets, straight
/// quotes, `<`, and the Japanese opening quotes `「` / `『`.
fn is_ascii_token_leading_delimiter(c: char) -> bool {
    matches!(
        c,
        '(' | '[' | '{' | '<' | '"' | '\''
            // Full-width ( [ {, escaped so they stay distinct from the ASCII
            // forms even if the file is width-folded.
            | '\u{FF08}' | '\u{FF3B}' | '\u{FF5B}'
            | '「' | '『'
    )
}
1106
/// Returns `true` for characters that may close a token and are not part of
/// its preserved core: ASCII and full-width closing brackets, straight
/// quotes, `>`, commas and full stops (ASCII, full-width and Japanese), and
/// the Japanese closing quotes `」` / `』`.
fn is_ascii_token_trailing_delimiter(c: char) -> bool {
    matches!(
        c,
        ')' | ']' | '}' | '>' | '"' | '\'' | ',' | '.'
            // Full-width , . ) ] }, escaped so they stay distinct from the
            // ASCII forms even if the file is width-folded.
            | '\u{FF0C}' | '\u{FF0E}'
            | '、' | '。'
            | '\u{FF09}' | '\u{FF3D}' | '\u{FF5D}'
            | '」' | '』'
    )
}
1128
/// Heuristic check for an e-mail-shaped token.
///
/// Requires an `@` with a non-empty part before it, a part after it that is
/// at least three bytes long and contains a `.`, and only ASCII
/// alphanumerics or `@ . _ % + -` throughout.
fn is_email_like(token: &str) -> bool {
    let allowed = |c: char| c.is_ascii_alphanumeric() || "@._%+-".contains(c);

    match token.split_once('@') {
        None => false,
        Some((local, domain)) => {
            !local.is_empty()
                && domain.contains('.')
                && domain.len() >= 3
                && token.chars().all(allowed)
        }
    }
}
1141
/// Returns `true` if `token` contains at least one ASCII digit and otherwise
/// only the separators `. , : / - + % _ #`.
fn is_number_like(token: &str) -> bool {
    const SEPARATORS: &str = ".,:/-+%_#";

    let only_digits_and_separators = token
        .chars()
        .all(|c| c.is_ascii_digit() || SEPARATORS.contains(c));

    only_digits_and_separators && token.chars().any(|c| c.is_ascii_digit())
}
1155
/// Builds a new string by applying `convert` to every character of `input`.
fn map_chars(input: &str, convert: impl Fn(char) -> char) -> String {
    let mut mapped = String::new();
    for ch in input.chars() {
        mapped.push(convert(ch));
    }
    mapped
}
1159
/// Moves `c` from the block starting at `from_start` to the same offset in
/// the block starting at `to_start`.
///
/// Returns `c` unchanged when it lies below `from_start`, when the
/// arithmetic would overflow, or when the result is not a valid `char`.
fn shift_char(c: char, from_start: u32, to_start: u32) -> char {
    (c as u32)
        // Checked arithmetic: an out-of-range `c` must not panic in debug
        // builds or wrap around in release builds.
        .checked_sub(from_start)
        .and_then(|offset| offset.checked_add(to_start))
        .and_then(char::from_u32)
        .unwrap_or(c)
}
1163
/// Maps one half-width katakana or half-width CJK punctuation character
/// (U+FF61..=U+FF9D) to its full-width form.
///
/// Characters outside that range are returned unchanged, including the
/// half-width sound marks U+FF9E / U+FF9F.
fn half_width_katakana_char_to_full_width(c: char) -> char {
    // First half-width code point covered by the table.
    const FIRST: u32 = 0xFF61;
    // Full-width forms indexed by offset from `FIRST`. The half-width source
    // characters are addressed by code point only, so width-folding tools
    // cannot collapse the mapping into an identity table.
    const FULL_WIDTH: &[char] = &[
        '。', '「', '」', '、', '・', // U+FF61..=U+FF65
        'ヲ', 'ァ', 'ィ', 'ゥ', 'ェ', 'ォ', 'ャ', 'ュ', 'ョ', 'ッ', // U+FF66..=U+FF6F
        'ー', // U+FF70
        'ア', 'イ', 'ウ', 'エ', 'オ', // U+FF71..=U+FF75
        'カ', 'キ', 'ク', 'ケ', 'コ', // U+FF76..=U+FF7A
        'サ', 'シ', 'ス', 'セ', 'ソ', // U+FF7B..=U+FF7F
        'タ', 'チ', 'ツ', 'テ', 'ト', // U+FF80..=U+FF84
        'ナ', 'ニ', 'ヌ', 'ネ', 'ノ', // U+FF85..=U+FF89
        'ハ', 'ヒ', 'フ', 'ヘ', 'ホ', // U+FF8A..=U+FF8E
        'マ', 'ミ', 'ム', 'メ', 'モ', // U+FF8F..=U+FF93
        'ヤ', 'ユ', 'ヨ', // U+FF94..=U+FF96
        'ラ', 'リ', 'ル', 'レ', 'ロ', // U+FF97..=U+FF9B
        'ワ', 'ン', // U+FF9C..=U+FF9D
    ];

    match c {
        '\u{FF61}'..='\u{FF9D}' => FULL_WIDTH
            .get((c as u32 - FIRST) as usize)
            .copied()
            .unwrap_or(c),
        _ => c,
    }
}
1230
/// Maps one full-width katakana or CJK punctuation character to its
/// half-width form.
///
/// Voiced and semi-voiced kana expand to the base kana followed by the
/// half-width sound mark (U+FF9E / U+FF9F). Returns an empty string when `c`
/// has no half-width form.
fn full_width_katakana_char_to_half_width(c: char) -> &'static str {
    // Half-width forms are written as escapes so width-folding tools cannot
    // collapse the mapping into an identity table.
    match c {
        'ヲ' => "\u{FF66}",
        'ァ' => "\u{FF67}",
        'ィ' => "\u{FF68}",
        'ゥ' => "\u{FF69}",
        'ェ' => "\u{FF6A}",
        'ォ' => "\u{FF6B}",
        'ャ' => "\u{FF6C}",
        'ュ' => "\u{FF6D}",
        'ョ' => "\u{FF6E}",
        'ッ' => "\u{FF6F}",
        'ー' => "\u{FF70}",
        'ア' => "\u{FF71}",
        'イ' => "\u{FF72}",
        'ウ' => "\u{FF73}",
        'エ' => "\u{FF74}",
        'オ' => "\u{FF75}",
        'カ' => "\u{FF76}",
        'キ' => "\u{FF77}",
        'ク' => "\u{FF78}",
        'ケ' => "\u{FF79}",
        'コ' => "\u{FF7A}",
        'サ' => "\u{FF7B}",
        'シ' => "\u{FF7C}",
        'ス' => "\u{FF7D}",
        'セ' => "\u{FF7E}",
        'ソ' => "\u{FF7F}",
        'タ' => "\u{FF80}",
        'チ' => "\u{FF81}",
        'ツ' => "\u{FF82}",
        'テ' => "\u{FF83}",
        'ト' => "\u{FF84}",
        'ナ' => "\u{FF85}",
        'ニ' => "\u{FF86}",
        'ヌ' => "\u{FF87}",
        'ネ' => "\u{FF88}",
        'ノ' => "\u{FF89}",
        'ハ' => "\u{FF8A}",
        'ヒ' => "\u{FF8B}",
        'フ' => "\u{FF8C}",
        'ヘ' => "\u{FF8D}",
        'ホ' => "\u{FF8E}",
        'マ' => "\u{FF8F}",
        'ミ' => "\u{FF90}",
        'ム' => "\u{FF91}",
        'メ' => "\u{FF92}",
        'モ' => "\u{FF93}",
        'ヤ' => "\u{FF94}",
        'ユ' => "\u{FF95}",
        'ヨ' => "\u{FF96}",
        'ラ' => "\u{FF97}",
        'リ' => "\u{FF98}",
        'ル' => "\u{FF99}",
        'レ' => "\u{FF9A}",
        'ロ' => "\u{FF9B}",
        'ワ' => "\u{FF9C}",
        'ン' => "\u{FF9D}",
        // Voiced kana: base + half-width voiced sound mark.
        'ヷ' => "\u{FF9C}\u{FF9E}",
        'ヸ' => "\u{FF72}\u{FF9E}",
        'ヹ' => "\u{FF74}\u{FF9E}",
        'ヺ' => "\u{FF66}\u{FF9E}",
        'ガ' => "\u{FF76}\u{FF9E}",
        'ギ' => "\u{FF77}\u{FF9E}",
        'グ' => "\u{FF78}\u{FF9E}",
        'ゲ' => "\u{FF79}\u{FF9E}",
        'ゴ' => "\u{FF7A}\u{FF9E}",
        'ザ' => "\u{FF7B}\u{FF9E}",
        'ジ' => "\u{FF7C}\u{FF9E}",
        'ズ' => "\u{FF7D}\u{FF9E}",
        'ゼ' => "\u{FF7E}\u{FF9E}",
        'ゾ' => "\u{FF7F}\u{FF9E}",
        'ダ' => "\u{FF80}\u{FF9E}",
        'ヂ' => "\u{FF81}\u{FF9E}",
        'ヅ' => "\u{FF82}\u{FF9E}",
        'デ' => "\u{FF83}\u{FF9E}",
        'ド' => "\u{FF84}\u{FF9E}",
        'バ' => "\u{FF8A}\u{FF9E}",
        'ビ' => "\u{FF8B}\u{FF9E}",
        'ブ' => "\u{FF8C}\u{FF9E}",
        'ベ' => "\u{FF8D}\u{FF9E}",
        'ボ' => "\u{FF8E}\u{FF9E}",
        'ヴ' => "\u{FF73}\u{FF9E}",
        // Semi-voiced kana: base + half-width semi-voiced sound mark.
        'パ' => "\u{FF8A}\u{FF9F}",
        'ピ' => "\u{FF8B}\u{FF9F}",
        'プ' => "\u{FF8C}\u{FF9F}",
        'ペ' => "\u{FF8D}\u{FF9F}",
        'ポ' => "\u{FF8E}\u{FF9F}",
        // Punctuation.
        '。' => "\u{FF61}",
        '「' => "\u{FF62}",
        '」' => "\u{FF63}",
        '、' => "\u{FF64}",
        '・' => "\u{FF65}",
        // No half-width form.
        _ => "",
    }
}
1327
/// Returns the full-width voiced katakana for a half-width base kana that is
/// followed by the voiced sound mark, or `None` if `c` has no voiced form.
fn voiced_half_width_katakana(c: char) -> Option<char> {
    // Keys are half-width code points, written as escapes so width-folding
    // tools cannot turn them into full-width kana.
    Some(match c {
        '\u{FF76}' => 'ガ',
        '\u{FF77}' => 'ギ',
        '\u{FF78}' => 'グ',
        '\u{FF79}' => 'ゲ',
        '\u{FF7A}' => 'ゴ',
        '\u{FF7B}' => 'ザ',
        '\u{FF7C}' => 'ジ',
        '\u{FF7D}' => 'ズ',
        '\u{FF7E}' => 'ゼ',
        '\u{FF7F}' => 'ゾ',
        '\u{FF80}' => 'ダ',
        '\u{FF81}' => 'ヂ',
        '\u{FF82}' => 'ヅ',
        '\u{FF83}' => 'デ',
        '\u{FF84}' => 'ド',
        '\u{FF8A}' => 'バ',
        '\u{FF8B}' => 'ビ',
        '\u{FF8C}' => 'ブ',
        '\u{FF8D}' => 'ベ',
        '\u{FF8E}' => 'ボ',
        '\u{FF73}' => 'ヴ',
        '\u{FF9C}' => 'ヷ',
        '\u{FF72}' => 'ヸ',
        '\u{FF74}' => 'ヹ',
        '\u{FF66}' => 'ヺ',
        _ => return None,
    })
}
1358
/// Returns the full-width semi-voiced katakana for a half-width base kana
/// that is followed by the semi-voiced sound mark, or `None` if `c` has no
/// semi-voiced form.
fn semi_voiced_half_width_katakana(c: char) -> Option<char> {
    // Keys are half-width code points, written as escapes so width-folding
    // tools cannot turn them into full-width kana.
    Some(match c {
        '\u{FF8A}' => 'パ',
        '\u{FF8B}' => 'ピ',
        '\u{FF8C}' => 'プ',
        '\u{FF8D}' => 'ペ',
        '\u{FF8E}' => 'ポ',
        _ => return None,
    })
}
1369
/// Returns the precomposed voiced katakana for an unvoiced hiragana base, or
/// `None` if the base has no voiced katakana form.
fn voiced_hiragana_to_katakana(c: char) -> Option<char> {
    let code = c as u32;
    match c {
        // In both kana blocks the voiced form directly follows its base, and
        // katakana sits 0x60 above hiragana: base + 1 + 0x60.
        'か' | 'き' | 'く' | 'け' | 'こ' | 'さ' | 'し' | 'す' | 'せ' | 'そ' | 'た' | 'ち'
        | 'つ' | 'て' | 'と' | 'は' | 'ひ' | 'ふ' | 'へ' | 'ほ' => char::from_u32(code + 0x61),
        'う' => Some('ヴ'),
        // わ ゐ ゑ を are contiguous, as are ヷ ヸ ヹ ヺ, 0x68 higher.
        'わ'..='を' => char::from_u32(code + 0x68),
        _ => None,
    }
}
1400
/// Returns the precomposed semi-voiced katakana for `は ひ ふ へ ほ`, or
/// `None` for any other character.
fn semi_voiced_hiragana_to_katakana(c: char) -> Option<char> {
    match c {
        // The semi-voiced form sits two places after its base, and katakana
        // sits 0x60 above hiragana: base + 2 + 0x60.
        'は' | 'ひ' | 'ふ' | 'へ' | 'ほ' => char::from_u32(c as u32 + 0x62),
        _ => None,
    }
}
1411
1412fn compose_dakuten(c: char) -> Option<char> {
1413 let voiced = add_dakuten(c);
1414 (voiced != c).then_some(voiced)
1415}
1416
/// Returns the semi-voiced form of an h-row kana (hiragana or katakana), or
/// `None` for any other character.
fn compose_handakuten(c: char) -> Option<char> {
    match c {
        // In both kana blocks the semi-voiced form sits two places after
        // its base (base, voiced, semi-voiced).
        'は' | 'ひ' | 'ふ' | 'へ' | 'ほ' | 'ハ' | 'ヒ' | 'フ' | 'ヘ' | 'ホ' => {
            char::from_u32(c as u32 + 2)
        }
        _ => None,
    }
}
1432
/// Splits a precomposed voiced or semi-voiced kana into its base kana and
/// the combining mark (U+3099 dakuten or U+309A handakuten).
///
/// Returns `None` for characters that are not in the table.
fn decompose_dakuten_char(c: char) -> Option<(char, char)> {
    const DAKUTEN: char = '\u{3099}';
    const HANDAKUTEN: char = '\u{309A}';

    // Base kana found `distance` code points below `c`, paired with `mark`.
    let below = |distance: u32, mark: char| {
        char::from_u32(c as u32 - distance).map(|base| (base, mark))
    };

    match c {
        // Voiced kana directly follow their base.
        'が' | 'ぎ' | 'ぐ' | 'げ' | 'ご' | 'ざ' | 'じ' | 'ず' | 'ぜ' | 'ぞ' | 'だ' | 'ぢ'
        | 'づ' | 'で' | 'ど' | 'ば' | 'び' | 'ぶ' | 'べ' | 'ぼ' | 'ガ' | 'ギ' | 'グ' | 'ゲ'
        | 'ゴ' | 'ザ' | 'ジ' | 'ズ' | 'ゼ' | 'ゾ' | 'ダ' | 'ヂ' | 'ヅ' | 'デ' | 'ド' | 'バ'
        | 'ビ' | 'ブ' | 'ベ' | 'ボ' => below(1, DAKUTEN),
        // Semi-voiced kana sit two places after their base.
        'ぱ' | 'ぴ' | 'ぷ' | 'ぺ' | 'ぽ' | 'パ' | 'ピ' | 'プ' | 'ペ' | 'ポ' => {
            below(2, HANDAKUTEN)
        }
        'ゔ' => Some(('う', DAKUTEN)),
        'ヴ' => Some(('ウ', DAKUTEN)),
        // ヷ ヸ ヹ ヺ are contiguous, eight places above ワ ヰ ヱ ヲ.
        'ヷ'..='ヺ' => below(8, DAKUTEN),
        _ => None,
    }
}
1494
/// Returns the voiced (dakuten) form of a kana, or `c` itself when it has no
/// precomposed voiced form.
fn add_dakuten(c: char) -> char {
    let code = c as u32;
    let voiced = match c {
        // In both kana blocks the voiced form directly follows its base.
        'か' | 'き' | 'く' | 'け' | 'こ' | 'さ' | 'し' | 'す' | 'せ' | 'そ' | 'た' | 'ち'
        | 'つ' | 'て' | 'と' | 'は' | 'ひ' | 'ふ' | 'へ' | 'ほ' | 'カ' | 'キ' | 'ク' | 'ケ'
        | 'コ' | 'サ' | 'シ' | 'ス' | 'セ' | 'ソ' | 'タ' | 'チ' | 'ツ' | 'テ' | 'ト' | 'ハ'
        | 'ヒ' | 'フ' | 'ヘ' | 'ホ' => char::from_u32(code + 1),
        'う' => Some('ゔ'),
        'ウ' => Some('ヴ'),
        // ワ ヰ ヱ ヲ are contiguous, eight places below ヷ ヸ ヹ ヺ.
        'ワ'..='ヲ' => char::from_u32(code + 8),
        _ => None,
    };
    voiced.unwrap_or(c)
}
1549
/// Maps one old-form kanji to its replacement; characters without an entry
/// are returned unchanged.
///
/// The table is a plain one-to-one lookup. Several old forms may share one
/// replacement (for example the three characters mapped to `弁`).
fn old_kanji_char_to_new(c: char) -> char {
    match c {
        '亞' => '亜',
        '惡' => '悪',
        '壓' => '圧',
        '圍' => '囲',
        '爲' => '為',
        '醫' => '医',
        '壹' => '壱',
        '稻' => '稲',
        '飮' => '飲',
        '隱' => '隠',
        '營' => '営',
        '榮' => '栄',
        '驛' => '駅',
        '圓' => '円',
        '鹽' => '塩',
        '奧' => '奥',
        '應' => '応',
        '歐' => '欧',
        '毆' => '殴',
        '櫻' => '桜',
        '假' => '仮',
        '價' => '価',
        '畫' => '画',
        '會' => '会',
        '懷' => '懐',
        '壞' => '壊',
        '樂' => '楽',
        '氣' => '気',
        '龜' => '亀',
        '僞' => '偽',
        '舊' => '旧',
        '據' => '拠',
        '擧' => '挙',
        '峽' => '峡',
        '狹' => '狭',
        '區' => '区',
        '驅' => '駆',
        '徑' => '径',
        '莖' => '茎',
        '惠' => '恵',
        '溪' => '渓',
        '經' => '経',
        '繼' => '継',
        '缺' => '欠',
        '劍' => '剣',
        '檢' => '検',
        '權' => '権',
        '獻' => '献',
        '縣' => '県',
        '險' => '険',
        '嚴' => '厳',
        '廣' => '広',
        '鑛' => '鉱',
        '號' => '号',
        '國' => '国',
        '黑' => '黒',
        '濟' => '済',
        '齋' => '斎',
        '劑' => '剤',
        '雜' => '雑',
        '參' => '参',
        '棧' => '桟',
        '蠶' => '蚕',
        '殘' => '残',
        '絲' => '糸',
        '齒' => '歯',
        '兒' => '児',
        '實' => '実',
        '舍' => '舎',
        '寫' => '写',
        '釋' => '釈',
        '壽' => '寿',
        '從' => '従',
        '澁' => '渋',
        '獸' => '獣',
        '縱' => '縦',
        '肅' => '粛',
        '處' => '処',
        '敍' => '叙',
        '將' => '将',
        '稱' => '称',
        '證' => '証',
        '奬' => '奨',
        '條' => '条',
        '乘' => '乗',
        '淨' => '浄',
        '剩' => '剰',
        '疊' => '畳',
        '讓' => '譲',
        '釀' => '醸',
        '眞' => '真',
        '寢' => '寝',
        '愼' => '慎',
        '盡' => '尽',
        '圖' => '図',
        '粹' => '粋',
        '醉' => '酔',
        '穗' => '穂',
        '隨' => '随',
        '髓' => '髄',
        '數' => '数',
        '聲' => '声',
        '靜' => '静',
        '齊' => '斉',
        '攝' => '摂',
        '竊' => '窃',
        '專' => '専',
        '戰' => '戦',
        '淺' => '浅',
        '潛' => '潜',
        // NOTE(review): both sides look identical here, which would make
        // this arm a no-op — confirm the intended old form.
        '遷' => '遷',
        '踐' => '践',
        '錢' => '銭',
        '禪' => '禅',
        '雙' => '双',
        '壯' => '壮',
        '爭' => '争',
        '莊' => '荘',
        '搜' => '捜',
        '插' => '挿',
        '巢' => '巣',
        '裝' => '装',
        '總' => '総',
        '騷' => '騒',
        '臟' => '臓',
        '藏' => '蔵',
        '屬' => '属',
        '續' => '続',
        '墮' => '堕',
        '對' => '対',
        '體' => '体',
        '帶' => '帯',
        '滯' => '滞',
        '臺' => '台',
        '瀧' => '滝',
        '擇' => '択',
        '澤' => '沢',
        '單' => '単',
        '膽' => '胆',
        '團' => '団',
        '彈' => '弾',
        '遲' => '遅',
        '癡' => '痴',
        '蟲' => '虫',
        '晝' => '昼',
        '鑄' => '鋳',
        '廳' => '庁',
        '聽' => '聴',
        '敕' => '勅',
        '鎭' => '鎮',
        '遞' => '逓',
        '鐵' => '鉄',
        '轉' => '転',
        '傳' => '伝',
        '黨' => '党',
        '盜' => '盗',
        '燈' => '灯',
        '當' => '当',
        '鬪' => '闘',
        '德' => '徳',
        '獨' => '独',
        '讀' => '読',
        '屆' => '届',
        '繩' => '縄',
        '貳' => '弐',
        '惱' => '悩',
        '腦' => '脳',
        '霸' => '覇',
        '廢' => '廃',
        '賣' => '売',
        '發' => '発',
        '髮' => '髪',
        '拔' => '抜',
        '蠻' => '蛮',
        '祕' => '秘',
        '濱' => '浜',
        '拂' => '払',
        '佛' => '仏',
        '竝' => '並',
        '變' => '変',
        '邊' => '辺',
        '辯' => '弁',
        '辨' => '弁',
        '瓣' => '弁',
        '舖' => '舗',
        '寶' => '宝',
        '豐' => '豊',
        '沒' => '没',
        '飜' => '翻',
        '萬' => '万',
        '滿' => '満',
        '默' => '黙',
        '藥' => '薬',
        '譯' => '訳',
        '豫' => '予',
        '餘' => '余',
        '與' => '与',
        '譽' => '誉',
        '搖' => '揺',
        '樣' => '様',
        '謠' => '謡',
        '來' => '来',
        '亂' => '乱',
        '覽' => '覧',
        '龍' => '竜',
        '兩' => '両',
        '獵' => '猟',
        '綠' => '緑',
        '壘' => '塁',
        '禮' => '礼',
        '勞' => '労',
        '樓' => '楼',
        '灣' => '湾',
        // No entry: keep the character as it is.
        _ => c,
    }
}
1768
/// Reports whether `c` falls in one of the two variation-selector ranges this
/// module recognises: U+FE00–U+FE0F or U+E0100–U+E01EF.
fn is_variation_selector(c: char) -> bool {
    let code_point = u32::from(c);
    let in_basic_range = (0xFE00..=0xFE0F).contains(&code_point);
    let in_supplement_range = (0xE0100..=0xE01EF).contains(&code_point);
    in_basic_range || in_supplement_range
}
1772
1773fn is_symbol_or_punctuation(c: char) -> bool {
1774 !c.is_whitespace()
1775 && (c.is_ascii_punctuation()
1776 || matches!(
1777 c,
1778 '\u{2000}'..='\u{206F}'
1779 | '\u{3000}'..='\u{303F}'
1780 | '\u{FE10}'..='\u{FE1F}'
1781 | '\u{FE30}'..='\u{FE4F}'
1782 | '\u{FF01}'..='\u{FF0F}'
1783 | '\u{FF1A}'..='\u{FF20}'
1784 | '\u{FF3B}'..='\u{FF40}'
1785 | '\u{FF5B}'..='\u{FF65}'
1786 | '\u{FFE0}'..='\u{FFE6}'
1787 )
1788 || is_japanese_symbol(c))
1789}
1790
/// Reports whether `c` is one of the explicitly listed symbols: Japanese
/// punctuation and brackets, the ASCII brackets and tilde that appear in the
/// table, assorted marks (…, ※, 〒, 〆, 〇, 〃) and the kana iteration marks.
fn is_japanese_symbol(c: char) -> bool {
    // The complete set of characters accepted by this predicate.
    const JAPANESE_SYMBOLS: &[char] = &[
        '、', '。', '・', '「', '」', '『', '』', '(', ')', '[', ']', '【', '】', '〜', '~', '…',
        '※', '〒', '〆', '〇', '〃', 'ゝ', 'ゞ', 'ヽ', 'ヾ',
    ];
    JAPANESE_SYMBOLS.contains(&c)
}
1820
1821#[cfg(test)]
1822mod tests {
1823 use super::*;
1824 use proptest::prelude::*;
1825
1826 #[test]
1827 fn test_to_half_width() {
1828 assert_eq!(to_half_width("ABC"), "ABC");
1829 assert_eq!(to_half_width("123"), "123");
1830 assert_eq!(to_half_width("!@#"), "!@#");
1831 assert_eq!(to_half_width(" "), " ");
1832 assert_eq!(to_half_width("Hello World"), "Hello World");
1833 assert_eq!(to_half_width("ABCあいう"), "ABCあいう");
1835 }
1836
1837 #[test]
1838 fn test_to_full_width() {
1839 assert_eq!(to_full_width("ABC"), "ABC");
1840 assert_eq!(to_full_width("123"), "123");
1841 assert_eq!(to_full_width("!@#"), "!@#");
1842 assert_eq!(to_full_width(" "), " ");
1843 assert_eq!(to_full_width("Hello World"), "Hello World");
1844 assert_eq!(to_full_width("ABCあいう"), "ABCあいう");
1846 }
1847
1848 #[test]
1849 fn test_to_hiragana() {
1850 assert_eq!(to_hiragana("カタカナ"), "かたかな");
1851 assert_eq!(to_hiragana("コンニチハ"), "こんにちは");
1852 assert_eq!(to_hiragana("アイウエオ"), "あいうえお");
1853 assert_eq!(to_hiragana("ヴァイオリン"), "ゔぁいおりん");
1854 assert_eq!(
1855 to_hiragana("ヷヸヹヺ"),
1856 "わ\u{3099}ゐ\u{3099}ゑ\u{3099}を\u{3099}"
1857 );
1858 assert_eq!(to_hiragana("カタカナABC"), "かたかなABC");
1860 }
1861
1862 #[test]
1863 fn test_to_katakana() {
1864 assert_eq!(to_katakana("ひらがな"), "ヒラガナ");
1865 assert_eq!(to_katakana("こんにちは"), "コンニチハ");
1866 assert_eq!(to_katakana("あいうえお"), "アイウエオ");
1867 assert_eq!(to_katakana("ゔぁいおりん"), "ヴァイオリン");
1868 assert_eq!(
1869 to_katakana("わ\u{3099}ゐ\u{3099}ゑ\u{3099}を\u{3099}"),
1870 "ヷヸヹヺ"
1871 );
1872 assert_eq!(to_katakana("か\u{3099}は\u{309A}"), "ガパ");
1873 assert_eq!(to_katakana(&to_hiragana("ヷヸヹヺ")), "ヷヸヹヺ");
1874 assert_eq!(to_katakana("ひらがなABC"), "ヒラガナABC");
1876 }
1877
1878 #[test]
1879 fn test_roundtrip_full_half_width() {
1880 let original = "ABC123!@#";
1881 let full = to_full_width(original);
1882 let back = to_half_width(&full);
1883 assert_eq!(original, back);
1884 }
1885
1886 #[test]
1887 fn test_roundtrip_hiragana_katakana() {
1888 let original = "こんにちは";
1889 let katakana = to_katakana(original);
1890 let back = to_hiragana(&katakana);
1891 assert_eq!(original, back);
1892 }
1893
1894 #[test]
1895 fn test_empty_string() {
1896 assert_eq!(to_half_width(""), "");
1897 assert_eq!(to_full_width(""), "");
1898 assert_eq!(to_hiragana(""), "");
1899 assert_eq!(to_katakana(""), "");
1900 }
1901
1902 #[test]
1903 fn test_is_hiragana() {
1904 assert!(is_hiragana('あ'));
1905 assert!(is_hiragana('ん'));
1906 assert!(!is_hiragana('ア'));
1907 assert!(!is_hiragana('A'));
1908 assert!(!is_hiragana('漢'));
1909 }
1910
1911 #[test]
1912 fn test_is_katakana() {
1913 assert!(is_katakana('ア'));
1914 assert!(is_katakana('ン'));
1915 assert!(is_katakana('ー'));
1916 assert!(is_katakana('ヷ'));
1917 assert!(is_katakana('ヸ'));
1918 assert!(is_katakana('ヹ'));
1919 assert!(is_katakana('ヺ'));
1920 assert!(!is_katakana('あ'));
1921 assert!(!is_katakana('A'));
1922 }
1923
1924 #[test]
1925 fn test_is_half_width_katakana() {
1926 assert!(is_half_width_katakana('ア'));
1927 assert!(is_half_width_katakana('ン'));
1928 assert!(is_half_width_katakana('゙'));
1929 assert!(is_half_width_katakana('゚'));
1930 assert!(!is_half_width_katakana('。'));
1931 assert!(!is_half_width_katakana('「'));
1932 assert!(!is_half_width_katakana('、'));
1933 assert!(!is_half_width_katakana('ア'));
1934 assert!(!is_half_width_katakana('A'));
1935 }
1936
1937 #[test]
1938 fn test_is_kanji() {
1939 assert!(is_kanji('漢'));
1940 assert!(is_kanji('字'));
1941 assert!(!is_kanji('あ'));
1942 assert!(!is_kanji('A'));
1943 }
1944
1945 #[test]
1946 fn test_is_full_width() {
1947 assert!(is_full_width('A'));
1948 assert!(is_full_width('1'));
1949 assert!(is_full_width('ア'));
1950 assert!(is_full_width('あ'));
1951 assert!(is_full_width('漢'));
1952 assert!(is_full_width('、'));
1953 assert!(is_full_width(' '));
1954 assert!(!is_full_width('A'));
1955 assert!(!is_full_width('ア'));
1956 }
1957
1958 #[test]
1959 fn test_count_character_types() {
1960 let counts = count_character_types("あア漢ABC123アイウ");
1961 assert_eq!(counts.hiragana, 1);
1962 assert_eq!(counts.katakana, 1);
1963 assert_eq!(counts.kanji, 1);
1964 assert_eq!(counts.ascii, 6);
1965 assert_eq!(counts.half_width_katakana, 3);
1966 }
1967
1968 #[test]
1969 fn test_normalize_whitespace() {
1970 assert_eq!(normalize_whitespace("Hello World"), "Hello World");
1971 assert_eq!(normalize_whitespace("A\t\t\tB"), "A B");
1972 assert_eq!(
1973 normalize_whitespace(" Multiple Spaces "),
1974 "Multiple Spaces"
1975 );
1976 }
1977
1978 #[test]
1979 fn test_half_width_katakana_to_full_width() {
1980 assert_eq!(half_width_katakana_to_full_width("カタカナ"), "カタカナ");
1981 assert_eq!(half_width_katakana_to_full_width("ガギグゲゴ"), "ガギグゲゴ");
1982 assert_eq!(half_width_katakana_to_full_width("パピプペポ"), "パピプペポ");
1983 assert_eq!(half_width_katakana_to_full_width("ヴヷイ゙エ゙ヺ"), "ヴヷヸヹヺ");
1984 assert_eq!(half_width_katakana_to_full_width("コンニチハ"), "コンニチハ");
1985 }
1986
1987 #[test]
1988 fn test_normalize_prolonged_sound() {
1989 assert_eq!(normalize_prolonged_sound("コーヒー"), "コーヒー");
1990 assert_eq!(normalize_prolonged_sound("コ〜ヒ〜"), "コーヒー");
1991 assert_eq!(normalize_prolonged_sound("ラーメン"), "ラーメン");
1992 }
1993
1994 #[test]
1995 fn test_expand_iteration_marks() {
1996 assert_eq!(expand_iteration_marks("いろゝ"), "いろろ");
1997 assert_eq!(expand_iteration_marks("かゞ"), "かが");
1998 assert_eq!(expand_iteration_marks("うゞ"), "うゔ");
1999 assert_eq!(expand_iteration_marks("いろゝゝ"), "いろろろ");
2000 assert_eq!(expand_iteration_marks("カヽヽ"), "カカカ");
2001 assert_eq!(expand_iteration_marks("トヽキ"), "トトキ");
2002 assert_eq!(expand_iteration_marks("カヾ"), "カガ");
2003 assert_eq!(expand_iteration_marks("ウヾ"), "ウヴ");
2004 }
2005
2006 #[test]
2007 fn test_full_width_katakana_to_half_width() {
2008 assert_eq!(full_width_katakana_to_half_width("カタカナ"), "カタカナ");
2009 assert_eq!(full_width_katakana_to_half_width("ガギグ"), "ガギグ");
2010 assert_eq!(full_width_katakana_to_half_width("パピプ"), "パピプ");
2011 assert_eq!(full_width_katakana_to_half_width("ヷヸヹヺ"), "ヷイ゙エ゙ヺ");
2012 assert_eq!(full_width_katakana_to_half_width("日本語ABC"), "日本語ABC");
2013 }
2014
2015 #[test]
2016 fn test_dakuten_normalization() {
2017 assert_eq!(combine_dakuten("か\u{3099}ハ\u{309A}"), "がパ");
2018 assert_eq!(decompose_dakuten("がパ"), "か\u{3099}ハ\u{309A}");
2019 assert_eq!(combine_dakuten("e\u{301} か\u{3099}"), "e\u{301} が");
2020 assert_eq!(decompose_dakuten("é がパ"), "é か\u{3099}ハ\u{309A}");
2021 assert_eq!(
2022 combine_dakuten("ワ\u{3099}ヰ\u{3099}ヱ\u{3099}ヲ\u{3099}"),
2023 "ヷヸヹヺ"
2024 );
2025 assert_eq!(
2026 decompose_dakuten("ヷヸヹヺ"),
2027 "ワ\u{3099}ヰ\u{3099}ヱ\u{3099}ヲ\u{3099}"
2028 );
2029 }
2030
2031 #[test]
2032 fn test_unicode_normalization() {
2033 assert_eq!(normalize_nfkc("ABC123ガ"), "ABC123ガ");
2034 assert_eq!(normalize_nfc("か\u{3099}"), "が");
2035 assert_eq!(normalize_nfd("が"), "か\u{3099}");
2036 assert_eq!(normalize_nfd("é が"), "e\u{301} か\u{3099}");
2037
2038 let options = NormalizeOptions {
2039 unicode: Some(UnicodeNormalizationForm::Nfd),
2040 ..NormalizeOptions::default()
2041 };
2042 assert_eq!(normalize_with_options("é が", &options), "e\u{301} が");
2043 }
2044
2045 #[test]
2046 fn test_normalize_punctuation_brackets_symbols() {
2047 assert_eq!(normalize_punctuation("A,B.C、D。"), "A、B。C、D。");
2048 assert_eq!(normalize_brackets_and_quotes("(\"本文\")"), "(「本文」)");
2049 assert_eq!(
2050 normalize_brackets_and_quotes("“本文” ‘注’"),
2051 "「本文」 『注』"
2052 );
2053 assert_eq!(normalize_symbols("コ〜ヒ~ - − —"), "コーヒー - - -");
2054 }
2055
2056 #[test]
2057 fn test_old_kanji_and_variation_selectors() {
2058 assert_eq!(old_kanji_to_new("舊字體の國語"), "旧字体の国語");
2059 assert_eq!(remove_variation_selectors("葛\u{E0100}"), "葛");
2060 }
2061
2062 #[test]
2063 fn test_character_type_ratios_and_analysis() {
2064 let ratios = character_type_ratios("あア漢A");
2065 assert_eq!(ratios.hiragana, 0.25);
2066 assert_eq!(ratios.katakana, 0.25);
2067 assert_eq!(ratios.kanji, 0.25);
2068 assert_eq!(ratios.ascii, 0.25);
2069
2070 assert!(is_mostly_japanese("日本語です", 0.8));
2071 assert!(is_mostly_japanese("スーパー", 1.0));
2072 assert!(!is_mostly_japanese("ABC123", 0.5));
2073 assert!(has_mixed_scripts("日本語ABC"));
2074 assert_eq!(extract_japanese("ABC日本語123"), "日本語");
2075 assert_eq!(extract_japanese("ABCスーパー123"), "スーパー");
2076 assert_eq!(extract_ascii("ABC日本語123"), "ABC123");
2077 assert_eq!(remove_symbols("日本語、ABC!"), "日本語ABC");
2078 assert_eq!(remove_symbols("スーパー、コーヒー!"), "スーパーコーヒー");
2079 assert_eq!(remove_symbols("日本語!#【ABC】※"), "日本語ABC");
2080 assert_eq!(remove_symbols("日本語 ABC DEF!"), "日本語 ABC DEF");
2081 }
2082
2083 #[test]
2084 fn test_normalize_default_and_options() {
2085 assert_eq!(normalize("ABC ガギグ,舊字體"), "ABC ガギグ、旧字体");
2086 assert_eq!(normalize("コ~ヒ~とラ〜メン"), "コーヒーとラーメン");
2087
2088 let options = NormalizeOptions {
2089 hiragana: true,
2090 half_width_ascii: true,
2091 punctuation: true,
2092 whitespace: WhitespaceMode::Collapse,
2093 ..NormalizeOptions::default()
2094 };
2095 assert_eq!(
2096 normalize_with_options("ABC カタカナ.", &options),
2097 "ABC かたかな。"
2098 );
2099
2100 let decompose_options = NormalizeOptions {
2101 decompose_dakuten: true,
2102 ..NormalizeOptions::default()
2103 };
2104 assert_eq!(
2105 normalize_with_options("ガ パ ヴ", &decompose_options),
2106 "カ\u{3099} ハ\u{309A} ウ\u{3099}"
2107 );
2108 }
2109
2110 #[test]
2111 fn test_normalizer_builder() {
2112 let normalizer = Normalizer::new()
2113 .hiragana(true)
2114 .half_width_ascii(true)
2115 .whitespace(WhitespaceMode::Collapse);
2116
2117 assert_eq!(normalizer.normalize("ABC カタカナ"), "ABC かたかな");
2118 }
2119
2120 #[test]
2121 fn test_normalizer_builder_last_direction_wins() {
2122 assert_eq!(
2123 Normalizer::new()
2124 .full_width_ascii(true)
2125 .half_width_ascii(true)
2126 .normalize("ABC ABC"),
2127 "ABC ABC"
2128 );
2129 assert_eq!(
2130 Normalizer::new()
2131 .half_width_ascii(true)
2132 .full_width_ascii(true)
2133 .normalize("ABC ABC"),
2134 "ABC ABC"
2135 );
2136 assert_eq!(
2137 Normalizer::new()
2138 .katakana(true)
2139 .hiragana(true)
2140 .normalize("カタカナ ひらがな"),
2141 "かたかな ひらがな"
2142 );
2143 assert_eq!(
2144 Normalizer::new()
2145 .half_width_katakana(false)
2146 .full_width_katakana(true)
2147 .normalize("カタカナ カタカナ"),
2148 "カタカナ カタカナ"
2149 );
2150 }
2151
2152 #[test]
2153 fn test_normalizer_builder_controls_all_options() {
2154 let normalizer = Normalizer::new()
2155 .unicode(UnicodeNormalizationForm::Nfkc)
2156 .unicode_normalization(None)
2157 .half_width_ascii(false)
2158 .half_width_katakana(false)
2159 .combine_dakuten(false)
2160 .decompose_dakuten(true)
2161 .punctuation(false)
2162 .brackets(false)
2163 .symbols(false)
2164 .old_kanji(false)
2165 .remove_variation_selectors(false)
2166 .expand_iteration_marks(false)
2167 .preserve_ascii_tokens(true)
2168 .whitespace(WhitespaceMode::Preserve);
2169
2170 assert_eq!(normalizer.options().unicode, None);
2171 assert!(normalizer.options().decompose_dakuten);
2172 assert!(!normalizer.options().combine_dakuten);
2173 assert_eq!(
2174 normalizer.normalize("舊字體,(カゝ) か\u{3099}"),
2175 "舊字體,(カゝ) か\u{3099}"
2176 );
2177 }
2178
2179 #[test]
2180 fn test_preserve_ascii_tokens() {
2181 let options = NormalizeOptions {
2182 preserve_ascii_tokens: true,
2183 ..NormalizeOptions::default()
2184 };
2185
2186 assert_eq!(
2187 normalize_with_options("URL https://example.com/a,b と ABC,", &options),
2188 "URL https://example.com/a,b と ABC、"
2189 );
2190 assert_eq!(
2191 normalize_with_options(
2192 "参照 (https://example.com/a,b), mail: user.name@example.com.",
2193 &options
2194 ),
2195 "参照 (https://example.com/a,b)、 mail: user.name@example.com。"
2196 );
2197 assert_eq!(
2198 normalize_with_options("価格 1,234.50,版 1.2.3.", &options),
2199 "価格 1,234.50、版 1.2.3."
2200 );
2201 assert_eq!(
2202 normalize_with_options("URL:https://example.com/a,b.", &options),
2203 "URL:https://example.com/a,b。"
2204 );
2205 assert_eq!(
2206 normalize_with_options("mail:user.name@example.com.", &options),
2207 "mail:user.name@example.com。"
2208 );
2209 }
2210
2211 proptest! {
2212 #[test]
2213 fn prop_full_half_ascii_roundtrip(input in "[ -~]*") {
2214 prop_assert_eq!(to_half_width(&to_full_width(&input)), input);
2215 }
2216
2217 #[test]
2218 fn prop_kana_roundtrip(input in "[ぁ-ゖ]*") {
2219 prop_assert_eq!(to_hiragana(&to_katakana(&input)), input);
2220 }
2221 }
2222}