japanese_text/
lib.rs

1//! # japanese-text
2//!
3//! 日本語テキスト正規化のための軽量なRustライブラリ
4//!
5//! ## 特徴
6//!
7//! - 全角⇔半角変換（ASCII文字）
8//! - カタカナ⇔ひらがな変換
9//! - Unicode正規化と日本語向け正規化をまとめて適用できるAPI
10//!
11//! ## 使用例
12//!
13//! ```
14//! use japanese_text::*;
15//!
16//! // 全角→半角変換
17//! assert_eq!(to_half_width("ＡＢＣ１２３"), "ABC123");
18//!
19//! // 半角→全角変換
20//! assert_eq!(to_full_width("ABC123"), "ＡＢＣ１２３");
21//!
22//! // カタカナ→ひらがな変換
23//! assert_eq!(to_hiragana("カタカナ"), "かたかな");
24//!
25//! // ひらがな→カタカナ変換
26//! assert_eq!(to_katakana("ひらがな"), "ヒラガナ");
27//! ```
28
29use unicode_normalization::UnicodeNormalization;
30
/// Unicode normalization form applied as the first step of a combined normalization.
///
/// Each variant selects the matching `normalize_*` function of this crate.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum UnicodeNormalizationForm {
    /// Normalization Form C, applied through [`normalize_nfc`].
    Nfc,
    /// Normalization Form D, applied through [`normalize_nfd`].
    Nfd,
    /// Normalization Form KC, applied through [`normalize_nfkc`].
    Nfkc,
    /// Normalization Form KD, applied through [`normalize_nfkd`].
    Nfkd,
}
39
/// How whitespace is treated at the end of a combined normalization.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WhitespaceMode {
    /// Leave whitespace exactly as it is.
    Preserve,
    /// Pass the text through [`normalize_whitespace`].
    Collapse,
    /// Only strip leading and trailing whitespace (`str::trim`).
    Trim,
}
47
/// Options for the combined normalization performed by [`normalize_with_options`].
///
/// Note the naming asymmetry: the `*_ascii` flags name the *target* width, while the
/// `*_katakana` width flags name the *source* width that gets converted.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizeOptions {
    /// Unicode normalization form applied before every other step; `None` skips it.
    pub unicode: Option<UnicodeNormalizationForm>,
    /// Convert full-width ASCII to half-width ([`to_half_width`]).
    pub half_width_ascii: bool,
    /// Convert half-width ASCII to full-width ([`to_full_width`]).
    pub full_width_ascii: bool,
    /// Convert katakana to hiragana ([`to_hiragana`]).
    pub hiragana: bool,
    /// Convert hiragana to katakana ([`to_katakana`]).
    pub katakana: bool,
    /// Convert half-width katakana to full-width ([`half_width_katakana_to_full_width`]).
    pub half_width_katakana: bool,
    /// Convert full-width katakana to half-width ([`full_width_katakana_to_half_width`]).
    pub full_width_katakana: bool,
    /// Compose base characters with combining (han)dakuten ([`combine_dakuten`]).
    /// Ignored when `decompose_dakuten` is also set.
    pub combine_dakuten: bool,
    /// Split voiced characters into base + combining mark ([`decompose_dakuten`]).
    pub decompose_dakuten: bool,
    /// Unify commas and full stops ([`normalize_punctuation`]).
    pub punctuation: bool,
    /// Unify brackets and quotation marks ([`normalize_brackets_and_quotes`]).
    pub brackets: bool,
    /// Unify wave dashes and hyphen-like characters ([`normalize_symbols`]).
    pub symbols: bool,
    /// Replace old-style kanji with their modern forms ([`old_kanji_to_new`]).
    pub old_kanji: bool,
    /// Strip variation selectors ([`remove_variation_selectors`]).
    pub remove_variation_selectors: bool,
    /// Expand kana iteration marks ([`expand_iteration_marks`]).
    pub expand_iteration_marks: bool,
    /// Whitespace handling applied as the final step.
    pub whitespace: WhitespaceMode,
    /// Keep URL-, e-mail- and number-like ASCII tokens verbatim.
    pub preserve_ascii_tokens: bool,
}
69
70impl Default for NormalizeOptions {
71    fn default() -> Self {
72        Self {
73            unicode: None,
74            half_width_ascii: true,
75            full_width_ascii: false,
76            hiragana: false,
77            katakana: false,
78            half_width_katakana: true,
79            full_width_katakana: false,
80            combine_dakuten: true,
81            decompose_dakuten: false,
82            punctuation: true,
83            brackets: true,
84            symbols: true,
85            old_kanji: true,
86            remove_variation_selectors: true,
87            expand_iteration_marks: true,
88            whitespace: WhitespaceMode::Collapse,
89            preserve_ascii_tokens: false,
90        }
91    }
92}
93
/// Builder that applies several normalization steps in one call.
///
/// Configure it with the chained setter methods, then call `normalize`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Normalizer {
    /// Option set handed to `normalize_with_options` by `normalize`.
    options: NormalizeOptions,
}
99
100impl Normalizer {
101    pub fn new() -> Self {
102        Self::default()
103    }
104
105    pub fn with_options(options: NormalizeOptions) -> Self {
106        Self { options }
107    }
108
109    pub fn unicode(mut self, form: UnicodeNormalizationForm) -> Self {
110        self.options.unicode = Some(form);
111        self
112    }
113
114    pub fn unicode_normalization(mut self, form: Option<UnicodeNormalizationForm>) -> Self {
115        self.options.unicode = form;
116        self
117    }
118
119    pub fn half_width_ascii(mut self, enabled: bool) -> Self {
120        self.options.half_width_ascii = enabled;
121        if enabled {
122            self.options.full_width_ascii = false;
123        }
124        self
125    }
126
127    pub fn full_width_ascii(mut self, enabled: bool) -> Self {
128        self.options.full_width_ascii = enabled;
129        if enabled {
130            self.options.half_width_ascii = false;
131        }
132        self
133    }
134
135    pub fn hiragana(mut self, enabled: bool) -> Self {
136        self.options.hiragana = enabled;
137        if enabled {
138            self.options.katakana = false;
139        }
140        self
141    }
142
143    pub fn katakana(mut self, enabled: bool) -> Self {
144        self.options.katakana = enabled;
145        if enabled {
146            self.options.hiragana = false;
147        }
148        self
149    }
150
151    pub fn half_width_katakana(mut self, enabled: bool) -> Self {
152        self.options.half_width_katakana = enabled;
153        if enabled {
154            self.options.full_width_katakana = false;
155        }
156        self
157    }
158
159    pub fn full_width_katakana(mut self, enabled: bool) -> Self {
160        self.options.full_width_katakana = enabled;
161        if enabled {
162            self.options.half_width_katakana = false;
163        }
164        self
165    }
166
167    pub fn whitespace(mut self, mode: WhitespaceMode) -> Self {
168        self.options.whitespace = mode;
169        self
170    }
171
172    pub fn combine_dakuten(mut self, enabled: bool) -> Self {
173        self.options.combine_dakuten = enabled;
174        if enabled {
175            self.options.decompose_dakuten = false;
176        }
177        self
178    }
179
180    pub fn decompose_dakuten(mut self, enabled: bool) -> Self {
181        self.options.decompose_dakuten = enabled;
182        if enabled {
183            self.options.combine_dakuten = false;
184        }
185        self
186    }
187
188    pub fn punctuation(mut self, enabled: bool) -> Self {
189        self.options.punctuation = enabled;
190        self
191    }
192
193    pub fn brackets(mut self, enabled: bool) -> Self {
194        self.options.brackets = enabled;
195        self
196    }
197
198    pub fn symbols(mut self, enabled: bool) -> Self {
199        self.options.symbols = enabled;
200        self
201    }
202
203    pub fn old_kanji(mut self, enabled: bool) -> Self {
204        self.options.old_kanji = enabled;
205        self
206    }
207
208    pub fn remove_variation_selectors(mut self, enabled: bool) -> Self {
209        self.options.remove_variation_selectors = enabled;
210        self
211    }
212
213    pub fn expand_iteration_marks(mut self, enabled: bool) -> Self {
214        self.options.expand_iteration_marks = enabled;
215        self
216    }
217
218    pub fn preserve_ascii_tokens(mut self, enabled: bool) -> Self {
219        self.options.preserve_ascii_tokens = enabled;
220        self
221    }
222
223    pub fn options(&self) -> &NormalizeOptions {
224        &self.options
225    }
226
227    pub fn normalize(&self, input: &str) -> String {
228        normalize_with_options(input, &self.options)
229    }
230}
231
/// Converts full-width ASCII characters to their half-width forms.
///
/// Full-width letters, digits and symbols (U+FF01–U+FF5E) become the matching
/// ASCII characters (U+0021–U+007E), and the ideographic space (U+3000) becomes
/// an ASCII space. Every other character is copied unchanged.
///
/// # Examples
///
/// ```
/// use japanese_text::to_half_width;
///
/// assert_eq!(to_half_width("ＡＢＣ"), "ABC");
/// assert_eq!(to_half_width("１２３"), "123");
/// assert_eq!(to_half_width("！＠＃"), "!@#");
/// assert_eq!(to_half_width("Hello　World"), "Hello World");
/// ```
pub fn to_half_width(input: &str) -> String {
    // Distance between a full-width form and its ASCII counterpart.
    const OFFSET: u32 = 0xFF01 - 0x0021;

    let mut out = String::with_capacity(input.len());
    for ch in input.chars() {
        let half = match ch {
            '\u{3000}' => ' ',
            '\u{FF01}'..='\u{FF5E}' => char::from_u32(ch as u32 - OFFSET).unwrap_or(ch),
            other => other,
        };
        out.push(half);
    }
    out
}
254
/// Converts half-width ASCII characters to their full-width forms.
///
/// Printable ASCII (U+0021–U+007E) becomes the matching full-width character
/// (U+FF01–U+FF5E), and the ASCII space becomes the ideographic space (U+3000).
/// Every other character is copied unchanged.
///
/// # Examples
///
/// ```
/// use japanese_text::to_full_width;
///
/// assert_eq!(to_full_width("ABC"), "ＡＢＣ");
/// assert_eq!(to_full_width("123"), "１２３");
/// assert_eq!(to_full_width("!@#"), "！＠＃");
/// assert_eq!(to_full_width("Hello World"), "Ｈｅｌｌｏ　Ｗｏｒｌｄ");
/// ```
pub fn to_full_width(input: &str) -> String {
    // Distance between an ASCII character and its full-width form.
    const OFFSET: u32 = 0xFF01 - 0x0021;

    let mut out = String::with_capacity(input.len());
    for ch in input.chars() {
        let full = match ch {
            ' ' => '\u{3000}',
            '\u{0021}'..='\u{007E}' => char::from_u32(ch as u32 + OFFSET).unwrap_or(ch),
            other => other,
        };
        out.push(full);
    }
    out
}
277
/// Converts katakana to hiragana.
///
/// Katakana in U+30A1–U+30F6 are shifted to the matching hiragana in
/// U+3041–U+3096. The four voiced katakana ヷ ヸ ヹ ヺ have no single hiragana
/// counterpart and become a hiragana base followed by the combining dakuten
/// (U+3099). Every other character is copied unchanged.
///
/// # Examples
///
/// ```
/// use japanese_text::to_hiragana;
///
/// assert_eq!(to_hiragana("カタカナ"), "かたかな");
/// assert_eq!(to_hiragana("コンニチハ"), "こんにちは");
/// assert_eq!(to_hiragana("ヴァイオリン"), "ゔぁいおりん");
/// assert_eq!(to_hiragana("ヷヸヹヺ"), "わ\u{3099}ゐ\u{3099}ゑ\u{3099}を\u{3099}");
/// ```
pub fn to_hiragana(input: &str) -> String {
    // Distance between a katakana and the hiragana with the same reading.
    const KANA_GAP: u32 = 0x30A1 - 0x3041;

    input.chars().fold(String::new(), |mut out, ch| {
        match ch {
            'ヷ' => out.push_str("わ\u{3099}"),
            'ヸ' => out.push_str("ゐ\u{3099}"),
            'ヹ' => out.push_str("ゑ\u{3099}"),
            'ヺ' => out.push_str("を\u{3099}"),
            '\u{30A1}'..='\u{30F6}' => {
                out.push(char::from_u32(ch as u32 - KANA_GAP).unwrap_or(ch));
            }
            other => out.push(other),
        }
        out
    })
}
309
310/// ひらがなをカタカナに変換します。
311///
312/// この関数はひらがな文字（U+3041-U+3096）を、
313/// 対応するカタカナ文字（U+30A1-U+30F6）に変換します。
314///
315/// # 使用例
316///
317/// ```
318/// use japanese_text::to_katakana;
319///
320/// assert_eq!(to_katakana("ひらがな"), "ヒラガナ");
321/// assert_eq!(to_katakana("こんにちは"), "コンニチハ");
322/// assert_eq!(to_katakana("ゔぁいおりん"), "ヴァイオリン");
323/// assert_eq!(to_katakana("わ\u{3099}ゐ\u{3099}ゑ\u{3099}を\u{3099}"), "ヷヸヹヺ");
324/// ```
325pub fn to_katakana(input: &str) -> String {
326    let mut result = String::new();
327    let mut chars = input.chars().peekable();
328
329    while let Some(c) = chars.next() {
330        match chars.peek().copied() {
331            Some('\u{3099}') => {
332                if let Some(voiced) = voiced_hiragana_to_katakana(c) {
333                    result.push(voiced);
334                    chars.next();
335                    continue;
336                }
337            }
338            Some('\u{309A}') => {
339                if let Some(semi_voiced) = semi_voiced_hiragana_to_katakana(c) {
340                    result.push(semi_voiced);
341                    chars.next();
342                    continue;
343                }
344            }
345            _ => {}
346        }
347
348        match c {
349            '\u{3041}'..='\u{3096}' => result.push(shift_char(c, 0x3041, 0x30A1)),
350            _ => result.push(c),
351        }
352    }
353
354    result
355}
356
357/// 全角カタカナを半角カタカナに変換します。
358///
359/// 濁点・半濁点付きのカタカナは、半角カタカナと半角濁点・半濁点の
360/// 2文字に分解されます。
361///
362/// ```
363/// use japanese_text::full_width_katakana_to_half_width;
364///
365/// assert_eq!(full_width_katakana_to_half_width("カタカナ"), "ｶﾀｶﾅ");
366/// assert_eq!(full_width_katakana_to_half_width("ガギグ"), "ｶﾞｷﾞｸﾞ");
367/// assert_eq!(full_width_katakana_to_half_width("パピプ"), "ﾊﾟﾋﾟﾌﾟ");
368/// ```
369pub fn full_width_katakana_to_half_width(input: &str) -> String {
370    let mut result = String::new();
371
372    for c in input.chars() {
373        let half = full_width_katakana_char_to_half_width(c);
374        if half.is_empty() {
375            result.push(c);
376        } else {
377            result.push_str(half);
378        }
379    }
380
381    result
382}
383
384/// 濁点・半濁点の結合文字を合成済み文字に変換します。
385///
386/// ```
387/// use japanese_text::combine_dakuten;
388///
389/// assert_eq!(combine_dakuten("か\u{3099}き\u{3099}"), "がぎ");
390/// assert_eq!(combine_dakuten("ハ\u{309A}"), "パ");
391/// ```
392pub fn combine_dakuten(input: &str) -> String {
393    let mut result = String::new();
394    let mut chars = input.chars().peekable();
395
396    while let Some(c) = chars.next() {
397        match chars.peek().copied() {
398            Some('\u{3099}') => {
399                if let Some(voiced) = compose_dakuten(c) {
400                    result.push(voiced);
401                    chars.next();
402                    continue;
403                }
404            }
405            Some('\u{309A}') => {
406                if let Some(semi_voiced) = compose_handakuten(c) {
407                    result.push(semi_voiced);
408                    chars.next();
409                    continue;
410                }
411            }
412            _ => {}
413        }
414
415        result.push(c);
416    }
417
418    result
419}
420
421/// 濁点・半濁点付き文字を基底文字と結合文字に分解します。
422///
423/// ```
424/// use japanese_text::decompose_dakuten;
425///
426/// assert_eq!(decompose_dakuten("が"), "か\u{3099}");
427/// assert_eq!(decompose_dakuten("パ"), "ハ\u{309A}");
428/// ```
429pub fn decompose_dakuten(input: &str) -> String {
430    let mut result = String::new();
431
432    for c in input.chars() {
433        if let Some((base, mark)) = decompose_dakuten_char(c) {
434            result.push(base);
435            result.push(mark);
436        } else {
437            result.push(c);
438        }
439    }
440
441    result
442}
443
444/// Unicode NFC正規化を適用します。
445pub fn normalize_nfc(input: &str) -> String {
446    input.nfc().collect()
447}
448
449/// Unicode NFD正規化を適用します。
450pub fn normalize_nfd(input: &str) -> String {
451    input.nfd().collect()
452}
453
454/// Unicode NFKC正規化を適用します。
455pub fn normalize_nfkc(input: &str) -> String {
456    input.nfkc().collect()
457}
458
459/// Unicode NFKD正規化を適用します。
460pub fn normalize_nfkd(input: &str) -> String {
461    input.nfkd().collect()
462}
463
/// Unifies commas and full stops to the Japanese forms `、` and `。`.
///
/// Full-width, ASCII and half-width variants are all replaced, so an ASCII `.`
/// or `,` inside a number is converted as well.
///
/// ```
/// use japanese_text::normalize_punctuation;
///
/// assert_eq!(normalize_punctuation("A，B．C､D｡"), "A、B。C、D。");
/// ```
pub fn normalize_punctuation(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    for ch in input.chars() {
        out.push(match ch {
            '，' | ',' | '､' => '、',
            '．' | '.' | '｡' => '。',
            other => other,
        });
    }
    out
}
478
/// Unifies brackets and quotation marks to Japanese forms.
///
/// Round and square brackets become `（` / `）`. Directional double quotes become
/// `「` / `」` and directional single quotes become `『` / `』`. Straight ASCII
/// quotes carry no direction, so they alternate between opening and closing in
/// the order they appear, tracked separately for `"` and `'`.
///
/// ```
/// use japanese_text::normalize_brackets_and_quotes;
///
/// assert_eq!(normalize_brackets_and_quotes("(\"本文\")"), "（「本文」）");
/// ```
pub fn normalize_brackets_and_quotes(input: &str) -> String {
    // Whether we are currently between an opening and a closing straight quote.
    let mut inside_double = false;
    let mut inside_single = false;

    input
        .chars()
        .map(|ch| match ch {
            '(' | '（' | '[' | '［' => '（',
            ')' | '）' | ']' | '］' => '）',
            '“' | '〝' => '「',
            '”' | '〟' => '」',
            '‘' => '『',
            '’' => '』',
            '"' => {
                inside_double = !inside_double;
                if inside_double {
                    '「'
                } else {
                    '」'
                }
            }
            '\'' => {
                inside_single = !inside_single;
                if inside_single {
                    '『'
                } else {
                    '』'
                }
            }
            other => other,
        })
        .collect()
}
513
/// Unifies wave dashes and hyphen-like characters.
///
/// The wave dash and the full-width tilde become the prolonged sound mark `ー`;
/// the hyphen, dash and minus look-alikes listed below become the ASCII `-`.
///
/// ```
/// use japanese_text::normalize_symbols;
///
/// assert_eq!(normalize_symbols("コ〜ヒ～ - − —"), "コーヒー - - -");
/// ```
pub fn normalize_symbols(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    for ch in input.chars() {
        let unified = if matches!(ch, '〜' | '～') {
            'ー'
        } else if matches!(ch, '‐' | '‑' | '‒' | '–' | '—' | '―' | '−' | '﹣' | '－') {
            '-'
        } else {
            ch
        };
        out.push(unified);
    }
    out
}
528
529/// 代表的な旧字体を新字体に変換します。
530///
531/// ```
532/// use japanese_text::old_kanji_to_new;
533///
534/// assert_eq!(old_kanji_to_new("舊字體の國語"), "旧字体の国語");
535/// ```
536pub fn old_kanji_to_new(input: &str) -> String {
537    map_chars(input, old_kanji_char_to_new)
538}
539
540/// 異体字セレクタを削除します。
541///
542/// ```
543/// use japanese_text::remove_variation_selectors;
544///
545/// assert_eq!(remove_variation_selectors("葛\u{E0100}"), "葛");
546/// ```
547pub fn remove_variation_selectors(input: &str) -> String {
548    input
549        .chars()
550        .filter(|&c| !is_variation_selector(c))
551        .collect()
552}
553
554/// 既定オプションでテキストを正規化します。
555pub fn normalize(input: &str) -> String {
556    normalize_with_options(input, &NormalizeOptions::default())
557}
558
559/// 指定したオプションでテキストを正規化します。
560pub fn normalize_with_options(input: &str, options: &NormalizeOptions) -> String {
561    if options.preserve_ascii_tokens {
562        return normalize_preserving_ascii_tokens(input, options);
563    }
564
565    normalize_segment(input, options)
566}
567
/// Returns `true` if `c` is a hiragana letter (U+3041–U+3096).
///
/// # Examples
///
/// ```
/// use japanese_text::is_hiragana;
///
/// assert_eq!(is_hiragana('あ'), true);
/// assert_eq!(is_hiragana('ア'), false);
/// assert_eq!(is_hiragana('A'), false);
/// ```
pub fn is_hiragana(c: char) -> bool {
    ('\u{3041}'..='\u{3096}').contains(&c)
}
582
/// Returns `true` if `c` is a full-width katakana letter (U+30A1–U+30FA) or the
/// prolonged sound mark `ー`.
///
/// # Examples
///
/// ```
/// use japanese_text::is_katakana;
///
/// assert_eq!(is_katakana('ア'), true);
/// assert_eq!(is_katakana('ー'), true);
/// assert_eq!(is_katakana('ヷ'), true);
/// assert_eq!(is_katakana('あ'), false);
/// assert_eq!(is_katakana('A'), false);
/// ```
pub fn is_katakana(c: char) -> bool {
    c == 'ー' || ('\u{30A1}'..='\u{30FA}').contains(&c)
}
599
/// Returns `true` if `c` is a half-width katakana character (U+FF66–U+FF9F).
///
/// Half-width punctuation such as `｡` lies outside that range and is rejected.
///
/// # Examples
///
/// ```
/// use japanese_text::is_half_width_katakana;
///
/// assert_eq!(is_half_width_katakana('ｱ'), true);
/// assert_eq!(is_half_width_katakana('ア'), false);
/// assert_eq!(is_half_width_katakana('｡'), false);
/// assert_eq!(is_half_width_katakana('A'), false);
/// ```
pub fn is_half_width_katakana(c: char) -> bool {
    let code = c as u32;
    (0xFF66..=0xFF9F).contains(&code)
}
615
/// Returns `true` if `c` lies in the CJK Unified Ideographs block (U+4E00–U+9FFF).
///
/// # Examples
///
/// ```
/// use japanese_text::is_kanji;
///
/// assert_eq!(is_kanji('漢'), true);
/// assert_eq!(is_kanji('字'), true);
/// assert_eq!(is_kanji('あ'), false);
/// assert_eq!(is_kanji('A'), false);
/// ```
pub fn is_kanji(c: char) -> bool {
    ('\u{4E00}'..='\u{9FFF}').contains(&c)
}
631
/// Returns `true` if `c` is one of the full-width characters this crate recognises.
///
/// Covered ranges: CJK symbols and punctuation (including the ideographic
/// space), hiragana letters and marks, the whole Katakana block, CJK Unified
/// Ideographs, full-width ASCII forms and full-width currency/sign forms.
/// Half-width katakana and plain ASCII are not full-width.
///
/// # Examples
///
/// ```
/// use japanese_text::is_full_width;
///
/// assert_eq!(is_full_width('Ａ'), true);
/// assert_eq!(is_full_width('１'), true);
/// assert_eq!(is_full_width('ア'), true);
/// assert_eq!(is_full_width('漢'), true);
/// assert_eq!(is_full_width('ゝ'), true);
/// assert_eq!(is_full_width('A'), false);
/// ```
pub fn is_full_width(c: char) -> bool {
    matches!(
        c,
        // CJK Symbols and Punctuation, including U+3000 IDEOGRAPHIC SPACE.
        '\u{3000}'..='\u{303F}'
            // Hiragana letters.
            | '\u{3041}'..='\u{3096}'
            // Hiragana combining/spacing voiced marks and iteration marks, so the
            // Hiragana block is treated like the Katakana block below.
            | '\u{3099}'..='\u{309F}'
            // The whole Katakana block.
            | '\u{30A0}'..='\u{30FF}'
            // CJK Unified Ideographs.
            | '\u{4E00}'..='\u{9FFF}'
            // Full-width ASCII forms.
            | '\u{FF01}'..='\u{FF5E}'
            // Full-width cent, pound, not, macron, broken bar, yen and won signs.
            | '\u{FFE0}'..='\u{FFE6}'
    )
}
658
/// Per-category character counts produced by [`count_character_types`].
///
/// Every character of the input is counted in exactly one field.
///
/// # Examples
///
/// ```
/// use japanese_text::count_character_types;
///
/// let counts = count_character_types("あア漢ABC123");
/// assert_eq!(counts.hiragana, 1);
/// assert_eq!(counts.katakana, 1);
/// assert_eq!(counts.kanji, 1);
/// assert_eq!(counts.ascii, 6);
/// ```
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct CharacterTypes {
    /// Characters accepted by [`is_hiragana`].
    pub hiragana: usize,
    /// Characters accepted by [`is_katakana`].
    pub katakana: usize,
    /// Characters accepted by [`is_half_width_katakana`].
    pub half_width_katakana: usize,
    /// Characters accepted by [`is_kanji`].
    pub kanji: usize,
    /// ASCII characters, including ASCII whitespace and punctuation.
    pub ascii: usize,
    /// Remaining characters accepted by [`is_full_width`].
    pub full_width: usize,
    /// Characters that match none of the categories above.
    pub other: usize,
}
682
/// Share of each character category, as produced by [`character_type_ratios`].
///
/// Each field is the matching [`CharacterTypes`] count divided by the total
/// number of characters; all fields are `0.0` for empty input.
#[derive(Debug, Clone, Copy, Default, PartialEq)]
pub struct CharacterTypeRatios {
    /// Share of hiragana characters.
    pub hiragana: f64,
    /// Share of full-width katakana characters.
    pub katakana: f64,
    /// Share of half-width katakana characters.
    pub half_width_katakana: f64,
    /// Share of kanji characters.
    pub kanji: f64,
    /// Share of ASCII characters.
    pub ascii: f64,
    /// Share of other full-width characters.
    pub full_width: f64,
    /// Share of characters in none of the categories above.
    pub other: f64,
}
694
695pub fn count_character_types(input: &str) -> CharacterTypes {
696    let mut counts = CharacterTypes::default();
697
698    for c in input.chars() {
699        if is_hiragana(c) {
700            counts.hiragana += 1;
701        } else if is_katakana(c) {
702            counts.katakana += 1;
703        } else if is_half_width_katakana(c) {
704            counts.half_width_katakana += 1;
705        } else if is_kanji(c) {
706            counts.kanji += 1;
707        } else if c.is_ascii() {
708            counts.ascii += 1;
709        } else if is_full_width(c) {
710            counts.full_width += 1;
711        } else {
712            counts.other += 1;
713        }
714    }
715
716    counts
717}
718
719/// 文字列内の各文字種の比率を計算します。
720pub fn character_type_ratios(input: &str) -> CharacterTypeRatios {
721    let counts = count_character_types(input);
722    let total = input.chars().count() as f64;
723
724    if total == 0.0 {
725        return CharacterTypeRatios::default();
726    }
727
728    CharacterTypeRatios {
729        hiragana: counts.hiragana as f64 / total,
730        katakana: counts.katakana as f64 / total,
731        half_width_katakana: counts.half_width_katakana as f64 / total,
732        kanji: counts.kanji as f64 / total,
733        ascii: counts.ascii as f64 / total,
734        full_width: counts.full_width as f64 / total,
735        other: counts.other as f64 / total,
736    }
737}
738
739/// 日本語文字が指定した比率以上かを判定します。
740pub fn is_mostly_japanese(input: &str, threshold: f64) -> bool {
741    let total = input.chars().count();
742    if total == 0 {
743        return false;
744    }
745
746    let counts = count_character_types(input);
747    let japanese = counts.hiragana + counts.katakana + counts.half_width_katakana + counts.kanji;
748    japanese as f64 / total as f64 >= threshold
749}
750
751/// ひらがな・カタカナ・漢字・ASCIIのうち複数種類が混在しているかを判定します。
752pub fn has_mixed_scripts(input: &str) -> bool {
753    let counts = count_character_types(input);
754    [
755        counts.hiragana,
756        counts.katakana,
757        counts.half_width_katakana,
758        counts.kanji,
759        counts.ascii,
760    ]
761    .into_iter()
762    .filter(|&count| count > 0)
763    .count()
764        > 1
765}
766
767/// 日本語文字だけを抽出します。
768pub fn extract_japanese(input: &str) -> String {
769    input
770        .chars()
771        .filter(|&c| is_hiragana(c) || is_katakana(c) || is_half_width_katakana(c) || is_kanji(c))
772        .collect()
773}
774
/// Keeps only the ASCII characters of `input`, in their original order.
pub fn extract_ascii(input: &str) -> String {
    let mut kept = String::from(input);
    kept.retain(|ch| ch.is_ascii());
    kept
}
779
780/// Unicodeの記号・句読点に分類される文字を削除します。
781pub fn remove_symbols(input: &str) -> String {
782    input
783        .chars()
784        .filter(|&c| !is_symbol_or_punctuation(c))
785        .collect()
786}
787
/// Normalizes whitespace in `input`.
///
/// Every run of whitespace — ASCII spaces, tabs, line breaks, the ideographic
/// space (U+3000) and other Unicode whitespace — is replaced by a single ASCII
/// space, and leading and trailing whitespace is removed.
///
/// # Examples
///
/// ```
/// use japanese_text::normalize_whitespace;
///
/// assert_eq!(normalize_whitespace("Hello　World"), "Hello World");
/// assert_eq!(normalize_whitespace("A\t\tB"), "A B");
/// ```
pub fn normalize_whitespace(input: &str) -> String {
    // `split_whitespace` already splits on every Unicode whitespace character,
    // U+3000 included, so no pre-mapping pass is needed.
    input.split_whitespace().collect::<Vec<_>>().join(" ")
}
810
811/// 半角カタカナを全角カタカナに変換します。
812///
813/// 濁点（゛）と半濁点（゜）も正しく結合されます。
814///
815/// # 使用例
816///
817/// ```
818/// use japanese_text::half_width_katakana_to_full_width;
819///
820/// assert_eq!(half_width_katakana_to_full_width("ｶﾀｶﾅ"), "カタカナ");
821/// assert_eq!(half_width_katakana_to_full_width("ｶﾞｷﾞｸﾞｹﾞｺﾞ"), "ガギグゲゴ");
822/// assert_eq!(half_width_katakana_to_full_width("ﾊﾟﾋﾟﾌﾟﾍﾟﾎﾟ"), "パピプペポ");
823/// ```
824pub fn half_width_katakana_to_full_width(input: &str) -> String {
825    let mut result = String::new();
826    let mut chars = input.chars().peekable();
827
828    while let Some(c) = chars.next() {
829        let converted = match chars.peek().copied() {
830            Some('ﾞ') => voiced_half_width_katakana(c),
831            Some('ﾟ') => semi_voiced_half_width_katakana(c),
832            _ => None,
833        };
834
835        if let Some(full) = converted {
836            result.push(full);
837            chars.next();
838        } else {
839            result.push(half_width_katakana_char_to_full_width(c));
840        }
841    }
842
843    result
844}
845
846/// 長音記号を正規化します（ー、〜、～などを統一）。
847///
848/// # 使用例
849///
850/// ```
851/// use japanese_text::normalize_prolonged_sound;
852///
853/// assert_eq!(normalize_prolonged_sound("コーヒー"), "コーヒー");
854/// assert_eq!(normalize_prolonged_sound("コ〜ヒ〜"), "コーヒー");
855/// ```
856pub fn normalize_prolonged_sound(input: &str) -> String {
857    map_chars(input, |c| match c {
858        '〜' | '～' => 'ー',
859        _ => c,
860    })
861}
862
863/// 繰り返し記号を展開します。
864///
865/// ひらがな・カタカナの繰り返し記号（ゝ、ゞ、ヽ、ヾ）を実際の文字に展開します。
866///
867/// # 使用例
868///
869/// ```
870/// use japanese_text::expand_iteration_marks;
871///
872/// assert_eq!(expand_iteration_marks("いろゝ"), "いろろ");
873/// assert_eq!(expand_iteration_marks("かゞ"), "かが");
874/// ```
875pub fn expand_iteration_marks(input: &str) -> String {
876    let mut result = String::new();
877
878    for c in input.chars() {
879        match c {
880            // ひらがな繰り返し記号（無声音）
881            'ゝ' => {
882                if let Some(prev) = result.chars().last() {
883                    result.push(prev);
884                } else {
885                    result.push(c);
886                }
887            }
888            // ひらがな繰り返し記号（濁音）
889            'ゞ' => {
890                if let Some(prev) = result.chars().last() {
891                    let voiced = add_dakuten(prev);
892                    result.push(voiced);
893                } else {
894                    result.push(c);
895                }
896            }
897            // カタカナ繰り返し記号（無声音）
898            'ヽ' => {
899                if let Some(prev) = result.chars().last() {
900                    result.push(prev);
901                } else {
902                    result.push(c);
903                }
904            }
905            // カタカナ繰り返し記号（濁音）
906            'ヾ' => {
907                if let Some(prev) = result.chars().last() {
908                    let voiced = add_dakuten(prev);
909                    result.push(voiced);
910                } else {
911                    result.push(c);
912                }
913            }
914            _ => result.push(c),
915        }
916    }
917
918    result
919}
920
921fn normalize_segment(input: &str, options: &NormalizeOptions) -> String {
922    let mut text = match options.unicode {
923        Some(UnicodeNormalizationForm::Nfc) => normalize_nfc(input),
924        Some(UnicodeNormalizationForm::Nfd) => normalize_nfd(input),
925        Some(UnicodeNormalizationForm::Nfkc) => normalize_nfkc(input),
926        Some(UnicodeNormalizationForm::Nfkd) => normalize_nfkd(input),
927        None => input.to_string(),
928    };
929
930    if options.remove_variation_selectors {
931        text = remove_variation_selectors(&text);
932    }
933    if options.half_width_katakana {
934        text = half_width_katakana_to_full_width(&text);
935    }
936    if options.hiragana {
937        text = to_hiragana(&text);
938    }
939    if options.katakana {
940        text = to_katakana(&text);
941    }
942    if options.decompose_dakuten {
943        text = decompose_dakuten(&text);
944    } else if options.combine_dakuten {
945        text = combine_dakuten(&text);
946    }
947    if options.full_width_katakana {
948        text = full_width_katakana_to_half_width(&text);
949    }
950    if options.symbols {
951        text = normalize_symbols(&text);
952    }
953    if options.half_width_ascii {
954        text = to_half_width(&text);
955    }
956    if options.full_width_ascii {
957        text = to_full_width(&text);
958    }
959    if options.punctuation {
960        text = normalize_punctuation(&text);
961    }
962    if options.brackets {
963        text = normalize_brackets_and_quotes(&text);
964    }
965    if options.old_kanji {
966        text = old_kanji_to_new(&text);
967    }
968    if options.expand_iteration_marks {
969        text = expand_iteration_marks(&text);
970    }
971
972    match options.whitespace {
973        WhitespaceMode::Preserve => text,
974        WhitespaceMode::Collapse => normalize_whitespace(&text),
975        WhitespaceMode::Trim => text.trim().to_string(),
976    }
977}
978
979fn normalize_preserving_ascii_tokens(input: &str, options: &NormalizeOptions) -> String {
980    let mut result = String::new();
981    let mut ascii_run = String::new();
982    let mut normal_run = String::new();
983
984    for c in input.chars() {
985        if c.is_ascii() && !c.is_ascii_whitespace() {
986            push_normalized_segment(&mut result, &normal_run, options);
987            normal_run.clear();
988            ascii_run.push(c);
989        } else {
990            push_normalized_or_preserved_token(&mut result, &ascii_run, options);
991            ascii_run.clear();
992            normal_run.push(c);
993        }
994    }
995
996    push_normalized_or_preserved_token(&mut result, &ascii_run, options);
997    push_normalized_segment(&mut result, &normal_run, options);
998
999    match options.whitespace {
1000        WhitespaceMode::Preserve => result,
1001        WhitespaceMode::Collapse => normalize_whitespace(&result),
1002        WhitespaceMode::Trim => result.trim().to_string(),
1003    }
1004}
1005
1006fn push_normalized_segment(result: &mut String, segment: &str, options: &NormalizeOptions) {
1007    if segment.is_empty() {
1008        return;
1009    }
1010
1011    let mut segment_options = options.clone();
1012    segment_options.preserve_ascii_tokens = false;
1013    segment_options.whitespace = WhitespaceMode::Preserve;
1014    result.push_str(&normalize_segment(segment, &segment_options));
1015}
1016
1017fn push_normalized_or_preserved_token(
1018    result: &mut String,
1019    token: &str,
1020    options: &NormalizeOptions,
1021) {
1022    if token.is_empty() {
1023        return;
1024    }
1025
1026    if let Some((leading, preserved, trailing)) = split_preserved_ascii_token(token) {
1027        push_normalized_segment(result, leading, options);
1028        result.push_str(preserved);
1029        push_normalized_segment(result, trailing, options);
1030    } else {
1031        push_normalized_segment(result, token, options);
1032    }
1033}
1034
1035fn split_preserved_ascii_token(token: &str) -> Option<(&str, &str, &str)> {
1036    if is_number_like(token) {
1037        return Some(("", token, ""));
1038    }
1039
1040    let leading_start = token
1041        .char_indices()
1042        .find(|&(_, c)| !is_ascii_token_leading_delimiter(c))
1043        .map(|(idx, _)| idx)
1044        .unwrap_or(token.len());
1045    let (leading, rest) = token.split_at(leading_start);
1046
1047    let mut core_end = rest.len();
1048    while core_end > 0 {
1049        let mut chars = rest[..core_end].char_indices();
1050        let Some((idx, c)) = chars.next_back() else {
1051            break;
1052        };
1053
1054        if is_ascii_token_trailing_delimiter(c) {
1055            core_end = idx;
1056        } else {
1057            break;
1058        }
1059    }
1060
1061    let candidate = &rest[..core_end];
1062
1063    if let Some((preserved_start, preserved_end)) = find_preserved_ascii_core(candidate) {
1064        let preserved_start = leading.len() + preserved_start;
1065        let preserved_end = leading.len() + preserved_end;
1066        Some((
1067            &token[..preserved_start],
1068            &token[preserved_start..preserved_end],
1069            &token[preserved_end..],
1070        ))
1071    } else {
1072        None
1073    }
1074}
1075
/// Returns `true` if `token` starts with an `http://` or `https://` scheme.
fn is_url_like(token: &str) -> bool {
    ["http://", "https://"]
        .iter()
        .any(|&scheme| token.starts_with(scheme))
}
1079
1080fn find_preserved_ascii_core(token: &str) -> Option<(usize, usize)> {
1081    if is_url_like(token) || is_email_like(token) || is_number_like(token) {
1082        return Some((0, token.len()));
1083    }
1084
1085    let url_start = match (token.find("http://"), token.find("https://")) {
1086        (Some(http), Some(https)) => Some(http.min(https)),
1087        (Some(http), None) => Some(http),
1088        (None, Some(https)) => Some(https),
1089        (None, None) => None,
1090    };
1091    if let Some(start) = url_start {
1092        return Some((start, token.len()));
1093    }
1094
1095    token
1096        .char_indices()
1097        .find_map(|(start, _)| is_email_like(&token[start..]).then_some((start, token.len())))
1098}
1099
/// Returns `true` if `c` is an opening bracket or quote that may precede a token.
fn is_ascii_token_leading_delimiter(c: char) -> bool {
    const LEADING_DELIMITERS: &str = "([{<\"'（［｛「『";
    LEADING_DELIMITERS.contains(c)
}
1106
/// Returns `true` for closing brackets, quotes, commas and full stops (ASCII and
/// full-width/Japanese) that may follow a preserved ASCII token.
fn is_ascii_token_trailing_delimiter(c: char) -> bool {
    const TRAILING_DELIMITERS: &str = ")]}>\"',.，．、。）］｝」』";
    TRAILING_DELIMITERS.contains(c)
}
1128
/// Heuristically decides whether `token` looks like an e-mail address.
///
/// The token is split at its first `@`. It qualifies when the part before is
/// non-empty, the part after is at least three bytes long and contains a `.`,
/// and every character of the token is an ASCII letter, digit or one of `@._%+-`.
fn is_email_like(token: &str) -> bool {
    const EXTRA_BYTES: &[u8] = b"@._%+-";

    match token.split_once('@') {
        Some((local, domain))
            if !local.is_empty() && domain.len() >= 3 && domain.contains('.') =>
        {
            // Any non-ASCII character contributes bytes >= 0x80, which fail both checks.
            token
                .bytes()
                .all(|byte| byte.is_ascii_alphanumeric() || EXTRA_BYTES.contains(&byte))
        }
        _ => false,
    }
}
1141
/// Returns `true` when `token` consists solely of ASCII digits and the separator
/// characters `.,:/-+%_#`, and contains at least one digit.
fn is_number_like(token: &str) -> bool {
    const SEPARATORS: &str = ".,:/-+%_#";

    let only_numeric_chars = token
        .chars()
        .all(|c| c.is_ascii_digit() || SEPARATORS.contains(c));

    only_numeric_chars && token.chars().any(|c| c.is_ascii_digit())
}
1155
/// Builds a new string by applying `convert` to every character of `input`.
fn map_chars(input: &str, convert: impl Fn(char) -> char) -> String {
    let mut converted = String::with_capacity(input.len());
    converted.extend(input.chars().map(convert));
    converted
}
1159
/// Moves `c` from the block starting at `from_start` to the block starting at
/// `to_start`, keeping its offset within the block.
///
/// `c` is returned unchanged when it lies below `from_start`, when the shifted
/// code point overflows `u32`, or when the result is not a valid `char`. Checked
/// arithmetic is used so that an out-of-range `c` can neither panic in
/// overflow-checked builds nor wrap around in release builds.
fn shift_char(c: char, from_start: u32, to_start: u32) -> char {
    u32::from(c)
        .checked_sub(from_start)
        .and_then(|offset| offset.checked_add(to_start))
        .and_then(char::from_u32)
        .unwrap_or(c)
}
1163
/// Converts a single half-width katakana character or half-width CJK punctuation
/// mark to its full-width counterpart.
///
/// Covers U+FF61 (`｡`) through U+FF9D (`ﾝ`). Everything else, including the
/// half-width voicing marks `ﾞ` and `ﾟ`, is returned unchanged.
fn half_width_katakana_char_to_full_width(c: char) -> char {
    /// Code point of `｡`, the first entry of `FULL_WIDTH`.
    const FIRST: u32 = 0xFF61;
    /// Full-width equivalents, indexed by offset from `FIRST`.
    static FULL_WIDTH: [char; 61] = [
        '。', '「', '」', '、', '・', 'ヲ', 'ァ', 'ィ', 'ゥ', 'ェ', 'ォ', 'ャ', 'ュ', 'ョ', 'ッ', 'ー',
        'ア', 'イ', 'ウ', 'エ', 'オ', 'カ', 'キ', 'ク', 'ケ', 'コ', 'サ', 'シ', 'ス', 'セ', 'ソ',
        'タ', 'チ', 'ツ', 'テ', 'ト', 'ナ', 'ニ', 'ヌ', 'ネ', 'ノ', 'ハ', 'ヒ', 'フ', 'ヘ', 'ホ',
        'マ', 'ミ', 'ム', 'メ', 'モ', 'ヤ', 'ユ', 'ヨ', 'ラ', 'リ', 'ル', 'レ', 'ロ', 'ワ', 'ン',
    ];

    u32::from(c)
        .checked_sub(FIRST)
        .and_then(|offset| FULL_WIDTH.get(offset as usize))
        .copied()
        .unwrap_or(c)
}
1230
/// Converts a single full-width katakana character or Japanese punctuation mark to
/// its half-width spelling.
///
/// Voiced and semi-voiced kana expand to the base kana followed by `ﾞ` or `ﾟ`.
/// An empty string is returned for characters without a mapping (including
/// `ヮ`, `ヰ`, `ヱ`, `ヵ` and `ヶ`).
fn full_width_katakana_char_to_half_width(c: char) -> &'static str {
    /// Code point of `ァ`, the first entry of `HALF_WIDTH`.
    const FIRST: u32 = 0x30A1;
    /// Half-width spellings for U+30A1 (`ァ`) through U+30FC (`ー`), indexed by
    /// offset from `FIRST`; `""` marks characters without a mapping.
    static HALF_WIDTH: [&str; 92] = [
        // ァ ア ィ イ ゥ ウ ェ エ ォ オ
        "ｧ", "ｱ", "ｨ", "ｲ", "ｩ", "ｳ", "ｪ", "ｴ", "ｫ", "ｵ",
        // カ ガ キ ギ ク グ ケ ゲ コ ゴ
        "ｶ", "ｶﾞ", "ｷ", "ｷﾞ", "ｸ", "ｸﾞ", "ｹ", "ｹﾞ", "ｺ", "ｺﾞ",
        // サ ザ シ ジ ス ズ セ ゼ ソ ゾ
        "ｻ", "ｻﾞ", "ｼ", "ｼﾞ", "ｽ", "ｽﾞ", "ｾ", "ｾﾞ", "ｿ", "ｿﾞ",
        // タ ダ チ ヂ ッ ツ ヅ テ デ ト ド
        "ﾀ", "ﾀﾞ", "ﾁ", "ﾁﾞ", "ｯ", "ﾂ", "ﾂﾞ", "ﾃ", "ﾃﾞ", "ﾄ", "ﾄﾞ",
        // ナ ニ ヌ ネ ノ
        "ﾅ", "ﾆ", "ﾇ", "ﾈ", "ﾉ",
        // ハ バ パ ヒ ビ ピ フ ブ プ ヘ ベ ペ ホ ボ ポ
        "ﾊ", "ﾊﾞ", "ﾊﾟ", "ﾋ", "ﾋﾞ", "ﾋﾟ", "ﾌ", "ﾌﾞ", "ﾌﾟ", "ﾍ", "ﾍﾞ", "ﾍﾟ", "ﾎ", "ﾎﾞ", "ﾎﾟ",
        // マ ミ ム メ モ
        "ﾏ", "ﾐ", "ﾑ", "ﾒ", "ﾓ",
        // ャ ヤ ュ ユ ョ ヨ
        "ｬ", "ﾔ", "ｭ", "ﾕ", "ｮ", "ﾖ",
        // ラ リ ル レ ロ
        "ﾗ", "ﾘ", "ﾙ", "ﾚ", "ﾛ",
        // ヮ ワ ヰ ヱ ヲ ン
        "", "ﾜ", "", "", "ｦ", "ﾝ",
        // ヴ ヵ ヶ
        "ｳﾞ", "", "",
        // ヷ ヸ ヹ ヺ
        "ﾜﾞ", "ｲﾞ", "ｴﾞ", "ｦﾞ",
        // ・ ー
        "･", "ｰ",
    ];

    match c {
        // Punctuation that lives outside the katakana block.
        '。' => "｡",
        '「' => "｢",
        '」' => "｣",
        '、' => "､",
        _ => u32::from(c)
            .checked_sub(FIRST)
            .and_then(|offset| HALF_WIDTH.get(offset as usize))
            .copied()
            .unwrap_or(""),
    }
}
1327
/// Returns the full-width voiced katakana for a half-width base kana that can
/// carry a dakuten, or `None` for any other character.
fn voiced_half_width_katakana(c: char) -> Option<char> {
    // The two strings are parallel: the n-th base maps to the n-th voiced kana.
    const BASES: &str = "ｶｷｸｹｺｻｼｽｾｿﾀﾁﾂﾃﾄﾊﾋﾌﾍﾎｳﾜｲｴｦ";
    const VOICED: &str = "ガギグゲゴザジズゼゾダヂヅデドバビブベボヴヷヸヹヺ";

    BASES
        .chars()
        .zip(VOICED.chars())
        .find(|&(base, _)| base == c)
        .map(|(_, voiced)| voiced)
}
1358
/// Returns the full-width semi-voiced katakana (`パ` row) for a half-width `ﾊ`-row
/// kana, or `None` for any other character.
fn semi_voiced_half_width_katakana(c: char) -> Option<char> {
    // Parallel strings: the n-th base maps to the n-th semi-voiced kana.
    const BASES: &str = "ﾊﾋﾌﾍﾎ";
    const SEMI_VOICED: &str = "パピプペポ";

    BASES
        .chars()
        .zip(SEMI_VOICED.chars())
        .find(|&(base, _)| base == c)
        .map(|(_, semi_voiced)| semi_voiced)
}
1369
/// Returns the voiced katakana corresponding to an unvoiced hiragana base
/// (e.g. `か` → `ガ`, `わ` → `ヷ`), or `None` when the character has no such form.
fn voiced_hiragana_to_katakana(c: char) -> Option<char> {
    // Parallel strings: the n-th hiragana base maps to the n-th voiced katakana.
    const BASES: &str = "かきくけこさしすせそたちつてとはひふへほうわゐゑを";
    const VOICED: &str = "ガギグゲゴザジズゼゾダヂヅデドバビブベボヴヷヸヹヺ";

    BASES
        .chars()
        .zip(VOICED.chars())
        .find(|&(base, _)| base == c)
        .map(|(_, voiced)| voiced)
}
1400
/// Returns the semi-voiced katakana (`パ` row) for a hiragana `は`-row base, or
/// `None` for any other character.
fn semi_voiced_hiragana_to_katakana(c: char) -> Option<char> {
    // Parallel strings: the n-th hiragana base maps to the n-th semi-voiced katakana.
    const BASES: &str = "はひふへほ";
    const SEMI_VOICED: &str = "パピプペポ";

    BASES
        .chars()
        .zip(SEMI_VOICED.chars())
        .find(|&(base, _)| base == c)
        .map(|(_, semi_voiced)| semi_voiced)
}
1411
1412fn compose_dakuten(c: char) -> Option<char> {
1413    let voiced = add_dakuten(c);
1414    (voiced != c).then_some(voiced)
1415}
1416
/// Returns the semi-voiced (`ぱ`/`パ` row) form of a `は`/`ハ`-row kana, or `None`
/// for any other character.
fn compose_handakuten(c: char) -> Option<char> {
    let takes_handakuten = matches!(
        c,
        'は' | 'ひ' | 'ふ' | 'へ' | 'ほ' | 'ハ' | 'ヒ' | 'フ' | 'ヘ' | 'ホ'
    );
    if !takes_handakuten {
        return None;
    }

    // In both kana blocks the semi-voiced form sits two code points after the base
    // (base, voiced, semi-voiced).
    char::from_u32(u32::from(c) + 2)
}
1432
/// Splits a precomposed voiced or semi-voiced kana into its base kana and the
/// combining mark (`U+3099` dakuten or `U+309A` handakuten).
///
/// Returns `None` for characters that are not precomposed voiced/semi-voiced kana.
fn decompose_dakuten_char(c: char) -> Option<(char, char)> {
    const DAKUTEN: char = '\u{3099}';
    const HANDAKUTEN: char = '\u{309A}';

    // `distance` is how many code points the base kana lies before `c`.
    let (distance, mark) = match c {
        // Regular voiced kana directly follow their base.
        'が' | 'ぎ' | 'ぐ' | 'げ' | 'ご' | 'ざ' | 'じ' | 'ず' | 'ぜ' | 'ぞ' | 'だ' | 'ぢ'
        | 'づ' | 'で' | 'ど' | 'ば' | 'び' | 'ぶ' | 'べ' | 'ぼ' | 'ガ' | 'ギ' | 'グ' | 'ゲ'
        | 'ゴ' | 'ザ' | 'ジ' | 'ズ' | 'ゼ' | 'ゾ' | 'ダ' | 'ヂ' | 'ヅ' | 'デ' | 'ド' | 'バ'
        | 'ビ' | 'ブ' | 'ベ' | 'ボ' => (1, DAKUTEN),
        // Semi-voiced kana follow the voiced form, two after the base.
        'ぱ' | 'ぴ' | 'ぷ' | 'ぺ' | 'ぽ' | 'パ' | 'ピ' | 'プ' | 'ペ' | 'ポ' => (2, HANDAKUTEN),
        // ゔ (U+3094) → う (U+3046) and ヴ (U+30F4) → ウ (U+30A6).
        'ゔ' | 'ヴ' => (0x4E, DAKUTEN),
        // ヷ ヸ ヹ ヺ (U+30F7..=U+30FA) → ワ ヰ ヱ ヲ (U+30EF..=U+30F2).
        'ヷ' | 'ヸ' | 'ヹ' | 'ヺ' => (8, DAKUTEN),
        _ => return None,
    };

    char::from_u32(u32::from(c) - distance).map(|base| (base, mark))
}
1494
/// Adds a dakuten (voicing mark) to a kana character (internal helper).
///
/// Returns the precomposed voiced form for hiragana and katakana that have one;
/// every other character is returned unchanged.
fn add_dakuten(c: char) -> char {
    let voiced_code = match c {
        // These kana are immediately followed by their voiced form.
        'か' | 'き' | 'く' | 'け' | 'こ' | 'さ' | 'し' | 'す' | 'せ' | 'そ' | 'た' | 'ち'
        | 'つ' | 'て' | 'と' | 'は' | 'ひ' | 'ふ' | 'へ' | 'ほ' | 'カ' | 'キ' | 'ク' | 'ケ'
        | 'コ' | 'サ' | 'シ' | 'ス' | 'セ' | 'ソ' | 'タ' | 'チ' | 'ツ' | 'テ' | 'ト' | 'ハ'
        | 'ヒ' | 'フ' | 'ヘ' | 'ホ' => u32::from(c) + 1,
        // ワ ヰ ヱ ヲ (U+30EF..=U+30F2) → ヷ ヸ ヹ ヺ (U+30F7..=U+30FA).
        'ワ' | 'ヰ' | 'ヱ' | 'ヲ' => u32::from(c) + 8,
        // The voiced "u" kana are not adjacent to their base.
        'う' => return 'ゔ',
        'ウ' => return 'ヴ',
        _ => return c,
    };

    char::from_u32(voiced_code).unwrap_or(c)
}
1549
/// Maps a traditional (kyūjitai) kanji to its modern (shinjitai) form.
///
/// Only the characters listed here are converted; any other character is returned
/// unchanged. Several old forms may share one new form (`辯`, `辨` and `瓣` all
/// become `弁`), so the mapping is not reversible.
fn old_kanji_char_to_new(c: char) -> char {
    match c {
        '亞' => '亜',
        '惡' => '悪',
        '壓' => '圧',
        '圍' => '囲',
        '爲' => '為',
        '醫' => '医',
        '壹' => '壱',
        '稻' => '稲',
        '飮' => '飲',
        '隱' => '隠',
        '營' => '営',
        '榮' => '栄',
        '驛' => '駅',
        '圓' => '円',
        '鹽' => '塩',
        '奧' => '奥',
        '應' => '応',
        '歐' => '欧',
        '毆' => '殴',
        '櫻' => '桜',
        '假' => '仮',
        '價' => '価',
        '畫' => '画',
        '會' => '会',
        '懷' => '懐',
        '壞' => '壊',
        '樂' => '楽',
        '氣' => '気',
        '龜' => '亀',
        '僞' => '偽',
        '舊' => '旧',
        '據' => '拠',
        '擧' => '挙',
        '峽' => '峡',
        '狹' => '狭',
        '區' => '区',
        '驅' => '駆',
        '徑' => '径',
        '莖' => '茎',
        '惠' => '恵',
        '溪' => '渓',
        '經' => '経',
        '繼' => '継',
        '缺' => '欠',
        '劍' => '剣',
        '檢' => '検',
        '權' => '権',
        '獻' => '献',
        '縣' => '県',
        '險' => '険',
        '嚴' => '厳',
        '廣' => '広',
        '鑛' => '鉱',
        '號' => '号',
        '國' => '国',
        '黑' => '黒',
        '濟' => '済',
        '齋' => '斎',
        '劑' => '剤',
        '雜' => '雑',
        '參' => '参',
        '棧' => '桟',
        '蠶' => '蚕',
        '殘' => '残',
        '絲' => '糸',
        '齒' => '歯',
        '兒' => '児',
        '實' => '実',
        '舍' => '舎',
        '寫' => '写',
        '釋' => '釈',
        '壽' => '寿',
        '從' => '従',
        '澁' => '渋',
        '獸' => '獣',
        '縱' => '縦',
        '肅' => '粛',
        '處' => '処',
        '敍' => '叙',
        '將' => '将',
        '稱' => '称',
        '證' => '証',
        '奬' => '奨',
        '條' => '条',
        '乘' => '乗',
        '淨' => '浄',
        '剩' => '剰',
        '疊' => '畳',
        '讓' => '譲',
        '釀' => '醸',
        '眞' => '真',
        '寢' => '寝',
        '愼' => '慎',
        '盡' => '尽',
        '圖' => '図',
        '粹' => '粋',
        '醉' => '酔',
        '穗' => '穂',
        '隨' => '随',
        '髓' => '髄',
        '數' => '数',
        '聲' => '声',
        '靜' => '静',
        '齊' => '斉',
        '攝' => '摂',
        '竊' => '窃',
        '專' => '専',
        '戰' => '戦',
        '淺' => '浅',
        '潛' => '潜',
        '踐' => '践',
        '錢' => '銭',
        '禪' => '禅',
        '雙' => '双',
        '壯' => '壮',
        '爭' => '争',
        '莊' => '荘',
        '搜' => '捜',
        '插' => '挿',
        '巢' => '巣',
        '裝' => '装',
        '總' => '総',
        '騷' => '騒',
        '臟' => '臓',
        '藏' => '蔵',
        '屬' => '属',
        '續' => '続',
        '墮' => '堕',
        '對' => '対',
        '體' => '体',
        '帶' => '帯',
        '滯' => '滞',
        '臺' => '台',
        '瀧' => '滝',
        '擇' => '択',
        '澤' => '沢',
        '單' => '単',
        '膽' => '胆',
        '團' => '団',
        '彈' => '弾',
        '遲' => '遅',
        '癡' => '痴',
        '蟲' => '虫',
        '晝' => '昼',
        '鑄' => '鋳',
        '廳' => '庁',
        '聽' => '聴',
        '敕' => '勅',
        '鎭' => '鎮',
        '遞' => '逓',
        '鐵' => '鉄',
        '轉' => '転',
        '傳' => '伝',
        '黨' => '党',
        '盜' => '盗',
        '燈' => '灯',
        '當' => '当',
        '鬪' => '闘',
        '德' => '徳',
        '獨' => '独',
        '讀' => '読',
        '屆' => '届',
        '繩' => '縄',
        '貳' => '弐',
        '惱' => '悩',
        '腦' => '脳',
        '霸' => '覇',
        '廢' => '廃',
        '賣' => '売',
        '發' => '発',
        '髮' => '髪',
        '拔' => '抜',
        '蠻' => '蛮',
        '祕' => '秘',
        '濱' => '浜',
        '拂' => '払',
        '佛' => '仏',
        '竝' => '並',
        '變' => '変',
        '邊' => '辺',
        '辯' => '弁',
        '辨' => '弁',
        '瓣' => '弁',
        '舖' => '舗',
        '寶' => '宝',
        '豐' => '豊',
        '沒' => '没',
        '飜' => '翻',
        '萬' => '万',
        '滿' => '満',
        '默' => '黙',
        '藥' => '薬',
        '譯' => '訳',
        '豫' => '予',
        '餘' => '余',
        '與' => '与',
        '譽' => '誉',
        '搖' => '揺',
        '樣' => '様',
        '謠' => '謡',
        '來' => '来',
        '亂' => '乱',
        '覽' => '覧',
        '龍' => '竜',
        '兩' => '両',
        '獵' => '猟',
        '綠' => '緑',
        '壘' => '塁',
        '禮' => '礼',
        '勞' => '労',
        '樓' => '楼',
        '灣' => '湾',
        _ => c,
    }
}
1768
/// Returns `true` for Unicode variation selectors: `U+FE00..=U+FE0F` and the
/// supplementary range `U+E0100..=U+E01EF`.
fn is_variation_selector(c: char) -> bool {
    let code = u32::from(c);
    (0xFE00..=0xFE0F).contains(&code) || (0xE0100..=0xE01EF).contains(&code)
}
1772
1773fn is_symbol_or_punctuation(c: char) -> bool {
1774    !c.is_whitespace()
1775        && (c.is_ascii_punctuation()
1776            || matches!(
1777                c,
1778                '\u{2000}'..='\u{206F}'
1779                    | '\u{3000}'..='\u{303F}'
1780                    | '\u{FE10}'..='\u{FE1F}'
1781                    | '\u{FE30}'..='\u{FE4F}'
1782                    | '\u{FF01}'..='\u{FF0F}'
1783                    | '\u{FF1A}'..='\u{FF20}'
1784                    | '\u{FF3B}'..='\u{FF40}'
1785                    | '\u{FF5B}'..='\u{FF65}'
1786                    | '\u{FFE0}'..='\u{FFE6}'
1787            )
1788            || is_japanese_symbol(c))
1789}
1790
/// Returns `true` for the Japanese punctuation marks, brackets, wave dashes,
/// iteration marks and miscellaneous symbols listed in `JAPANESE_SYMBOLS`.
fn is_japanese_symbol(c: char) -> bool {
    const JAPANESE_SYMBOLS: &str = "、。・「」『』（）［］【】〜～…※〒〆〇〃ゝゞヽヾ";
    JAPANESE_SYMBOLS.contains(c)
}
1820
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    #[test]
    fn test_to_half_width() {
        // The final case mixes full-width ASCII with kana, which must pass through.
        for (input, expected) in [
            ("ＡＢＣ", "ABC"),
            ("１２３", "123"),
            ("！＠＃", "!@#"),
            ("　", " "),
            ("Ｈｅｌｌｏ　Ｗｏｒｌｄ", "Hello World"),
            ("ＡＢＣあいう", "ABCあいう"),
        ] {
            assert_eq!(to_half_width(input), expected);
        }
    }

    #[test]
    fn test_to_full_width() {
        // The final case mixes ASCII with kana, which must pass through.
        for (input, expected) in [
            ("ABC", "ＡＢＣ"),
            ("123", "１２３"),
            ("!@#", "！＠＃"),
            (" ", "　"),
            ("Hello World", "Ｈｅｌｌｏ　Ｗｏｒｌｄ"),
            ("ABCあいう", "ＡＢＣあいう"),
        ] {
            assert_eq!(to_full_width(input), expected);
        }
    }

    #[test]
    fn test_to_hiragana() {
        // ヷヸヹヺ have no precomposed hiragana, so a combining dakuten is expected.
        for (input, expected) in [
            ("カタカナ", "かたかな"),
            ("コンニチハ", "こんにちは"),
            ("アイウエオ", "あいうえお"),
            ("ヴァイオリン", "ゔぁいおりん"),
            ("ヷヸヹヺ", "わ\u{3099}ゐ\u{3099}ゑ\u{3099}を\u{3099}"),
            ("カタカナABC", "かたかなABC"),
        ] {
            assert_eq!(to_hiragana(input), expected);
        }
    }

    #[test]
    fn test_to_katakana() {
        for (input, expected) in [
            ("ひらがな", "ヒラガナ"),
            ("こんにちは", "コンニチハ"),
            ("あいうえお", "アイウエオ"),
            ("ゔぁいおりん", "ヴァイオリン"),
            ("わ\u{3099}ゐ\u{3099}ゑ\u{3099}を\u{3099}", "ヷヸヹヺ"),
            ("か\u{3099}は\u{309A}", "ガパ"),
        ] {
            assert_eq!(to_katakana(input), expected);
        }

        // Katakana → hiragana → katakana must restore the voiced wa-row kana.
        assert_eq!(to_katakana(&to_hiragana("ヷヸヹヺ")), "ヷヸヹヺ");
        // Mixed content
        assert_eq!(to_katakana("ひらがなABC"), "ヒラガナABC");
    }

    #[test]
    fn test_roundtrip_full_half_width() {
        let source = "ABC123!@#";
        let widened = to_full_width(source);
        let restored = to_half_width(&widened);
        assert_eq!(source, restored);
    }

    #[test]
    fn test_roundtrip_hiragana_katakana() {
        let source = "こんにちは";
        let as_katakana = to_katakana(source);
        let restored = to_hiragana(&as_katakana);
        assert_eq!(source, restored);
    }

    #[test]
    fn test_empty_string() {
        let empty = "";
        assert_eq!(to_half_width(empty), "");
        assert_eq!(to_full_width(empty), "");
        assert_eq!(to_hiragana(empty), "");
        assert_eq!(to_katakana(empty), "");
    }

    #[test]
    fn test_is_hiragana() {
        for c in ['あ', 'ん'] {
            assert!(is_hiragana(c));
        }
        for c in ['ア', 'A', '漢'] {
            assert!(!is_hiragana(c));
        }
    }

    #[test]
    fn test_is_katakana() {
        for c in ['ア', 'ン', 'ー', 'ヷ', 'ヸ', 'ヹ', 'ヺ'] {
            assert!(is_katakana(c));
        }
        for c in ['あ', 'A'] {
            assert!(!is_katakana(c));
        }
    }

    #[test]
    fn test_is_half_width_katakana() {
        // Voicing marks count as half-width katakana; half-width punctuation does not.
        for c in ['ｱ', 'ﾝ', 'ﾞ', 'ﾟ'] {
            assert!(is_half_width_katakana(c));
        }
        for c in ['｡', '｢', '､', 'ア', 'A'] {
            assert!(!is_half_width_katakana(c));
        }
    }

    #[test]
    fn test_is_kanji() {
        for c in ['漢', '字'] {
            assert!(is_kanji(c));
        }
        for c in ['あ', 'A'] {
            assert!(!is_kanji(c));
        }
    }

    #[test]
    fn test_is_full_width() {
        for c in ['Ａ', '１', 'ア', 'あ', '漢', '、', '　'] {
            assert!(is_full_width(c));
        }
        for c in ['A', 'ｱ'] {
            assert!(!is_full_width(c));
        }
    }

    #[test]
    fn test_count_character_types() {
        let counts = count_character_types("あア漢ABC123ｱｲｳ");
        assert_eq!(
            (
                counts.hiragana,
                counts.katakana,
                counts.kanji,
                counts.ascii,
                counts.half_width_katakana,
            ),
            (1, 1, 1, 6, 3)
        );
    }

    #[test]
    fn test_normalize_whitespace() {
        for (input, expected) in [
            ("Hello　World", "Hello World"),
            ("A\t\t\tB", "A B"),
            ("  Multiple   Spaces  ", "Multiple Spaces"),
        ] {
            assert_eq!(normalize_whitespace(input), expected);
        }
    }

    #[test]
    fn test_half_width_katakana_to_full_width() {
        for (input, expected) in [
            ("ｶﾀｶﾅ", "カタカナ"),
            ("ｶﾞｷﾞｸﾞｹﾞｺﾞ", "ガギグゲゴ"),
            ("ﾊﾟﾋﾟﾌﾟﾍﾟﾎﾟ", "パピプペポ"),
            ("ｳﾞﾜﾞｲﾞｴﾞｦﾞ", "ヴヷヸヹヺ"),
            ("ｺﾝﾆﾁﾊ", "コンニチハ"),
        ] {
            assert_eq!(half_width_katakana_to_full_width(input), expected);
        }
    }

    #[test]
    fn test_normalize_prolonged_sound() {
        for (input, expected) in [
            ("コーヒー", "コーヒー"),
            ("コ〜ヒ〜", "コーヒー"),
            ("ラーメン", "ラーメン"),
        ] {
            assert_eq!(normalize_prolonged_sound(input), expected);
        }
    }

    #[test]
    fn test_expand_iteration_marks() {
        for (input, expected) in [
            ("いろゝ", "いろろ"),
            ("かゞ", "かが"),
            ("うゞ", "うゔ"),
            ("いろゝゝ", "いろろろ"),
            ("カヽヽ", "カカカ"),
            ("トヽキ", "トトキ"),
            ("カヾ", "カガ"),
            ("ウヾ", "ウヴ"),
        ] {
            assert_eq!(expand_iteration_marks(input), expected);
        }
    }

    #[test]
    fn test_full_width_katakana_to_half_width() {
        for (input, expected) in [
            ("カタカナ", "ｶﾀｶﾅ"),
            ("ガギグ", "ｶﾞｷﾞｸﾞ"),
            ("パピプ", "ﾊﾟﾋﾟﾌﾟ"),
            ("ヷヸヹヺ", "ﾜﾞｲﾞｴﾞｦﾞ"),
            ("日本語ABC", "日本語ABC"),
        ] {
            assert_eq!(full_width_katakana_to_half_width(input), expected);
        }
    }

    #[test]
    fn test_dakuten_normalization() {
        // Combining marks on Latin letters must be left alone in both directions.
        for (input, expected) in [
            ("か\u{3099}ハ\u{309A}", "がパ"),
            ("e\u{301} か\u{3099}", "e\u{301} が"),
            ("ワ\u{3099}ヰ\u{3099}ヱ\u{3099}ヲ\u{3099}", "ヷヸヹヺ"),
        ] {
            assert_eq!(combine_dakuten(input), expected);
        }

        for (input, expected) in [
            ("がパ", "か\u{3099}ハ\u{309A}"),
            ("é がパ", "é か\u{3099}ハ\u{309A}"),
            ("ヷヸヹヺ", "ワ\u{3099}ヰ\u{3099}ヱ\u{3099}ヲ\u{3099}"),
        ] {
            assert_eq!(decompose_dakuten(input), expected);
        }
    }

    #[test]
    fn test_unicode_normalization() {
        assert_eq!(normalize_nfkc("ＡＢＣ１２３ｶﾞ"), "ABC123ガ");
        assert_eq!(normalize_nfc("か\u{3099}"), "が");
        for (input, expected) in [
            ("が", "か\u{3099}"),
            ("é が", "e\u{301} か\u{3099}"),
        ] {
            assert_eq!(normalize_nfd(input), expected);
        }

        let nfd_options = NormalizeOptions {
            unicode: Some(UnicodeNormalizationForm::Nfd),
            ..NormalizeOptions::default()
        };
        assert_eq!(normalize_with_options("é が", &nfd_options), "e\u{301} が");
    }

    #[test]
    fn test_normalize_punctuation_brackets_symbols() {
        assert_eq!(normalize_punctuation("A，B．C､D｡"), "A、B。C、D。");
        for (input, expected) in [
            ("(\"本文\")", "（「本文」）"),
            ("“本文” ‘注’", "「本文」 『注』"),
        ] {
            assert_eq!(normalize_brackets_and_quotes(input), expected);
        }
        assert_eq!(normalize_symbols("コ〜ヒ～ - − —"), "コーヒー - - -");
    }

    #[test]
    fn test_old_kanji_and_variation_selectors() {
        let modernized = old_kanji_to_new("舊字體の國語");
        assert_eq!(modernized, "旧字体の国語");

        let stripped = remove_variation_selectors("葛\u{E0100}");
        assert_eq!(stripped, "葛");
    }

    #[test]
    fn test_character_type_ratios_and_analysis() {
        let ratios = character_type_ratios("あア漢A");
        assert_eq!(
            (ratios.hiragana, ratios.katakana, ratios.kanji, ratios.ascii),
            (0.25, 0.25, 0.25, 0.25)
        );

        for (text, threshold) in [("日本語です", 0.8), ("スーパー", 1.0)] {
            assert!(is_mostly_japanese(text, threshold));
        }
        assert!(!is_mostly_japanese("ABC123", 0.5));
        assert!(has_mixed_scripts("日本語ABC"));

        for (input, expected) in [
            ("ABC日本語123", "日本語"),
            ("ABCスーパー123", "スーパー"),
        ] {
            assert_eq!(extract_japanese(input), expected);
        }
        assert_eq!(extract_ascii("ABC日本語123"), "ABC123");

        for (input, expected) in [
            ("日本語、ABC!", "日本語ABC"),
            ("スーパー、コーヒー!", "スーパーコーヒー"),
            ("日本語！＃【ABC】※", "日本語ABC"),
            ("日本語　ABC DEF！", "日本語　ABC DEF"),
        ] {
            assert_eq!(remove_symbols(input), expected);
        }
    }

    #[test]
    fn test_normalize_default_and_options() {
        for (input, expected) in [
            ("ＡＢＣ　ｶﾞｷﾞｸﾞ，舊字體", "ABC ガギグ、旧字体"),
            ("コ～ヒ～とラ〜メン", "コーヒーとラーメン"),
        ] {
            assert_eq!(normalize(input), expected);
        }

        let hiragana_options = NormalizeOptions {
            hiragana: true,
            half_width_ascii: true,
            punctuation: true,
            whitespace: WhitespaceMode::Collapse,
            ..NormalizeOptions::default()
        };
        let hiragana_result = normalize_with_options("ＡＢＣ　カタカナ．", &hiragana_options);
        assert_eq!(hiragana_result, "ABC かたかな。");

        let decompose_options = NormalizeOptions {
            decompose_dakuten: true,
            ..NormalizeOptions::default()
        };
        let decomposed = normalize_with_options("ｶﾞ ﾊﾟ ヴ", &decompose_options);
        assert_eq!(decomposed, "カ\u{3099} ハ\u{309A} ウ\u{3099}");
    }

    #[test]
    fn test_normalizer_builder() {
        let hiragana_normalizer = Normalizer::new()
            .hiragana(true)
            .half_width_ascii(true)
            .whitespace(WhitespaceMode::Collapse);

        assert_eq!(
            hiragana_normalizer.normalize("ＡＢＣ　カタカナ"),
            "ABC かたかな"
        );
    }

    #[test]
    fn test_normalizer_builder_last_direction_wins() {
        let half_ascii_last = Normalizer::new()
            .full_width_ascii(true)
            .half_width_ascii(true);
        assert_eq!(half_ascii_last.normalize("ＡＢＣ ABC"), "ABC ABC");

        let full_ascii_last = Normalizer::new()
            .half_width_ascii(true)
            .full_width_ascii(true);
        assert_eq!(full_ascii_last.normalize("ＡＢＣ ABC"), "ＡＢＣ ＡＢＣ");

        let hiragana_last = Normalizer::new().katakana(true).hiragana(true);
        assert_eq!(
            hiragana_last.normalize("カタカナ ひらがな"),
            "かたかな ひらがな"
        );

        let full_width_katakana_only = Normalizer::new()
            .half_width_katakana(false)
            .full_width_katakana(true);
        assert_eq!(
            full_width_katakana_only.normalize("カタカナ ｶﾀｶﾅ"),
            "ｶﾀｶﾅ ｶﾀｶﾅ"
        );
    }

    #[test]
    fn test_normalizer_builder_controls_all_options() {
        // Everything except dakuten decomposition is switched off, so the sample
        // text (which is already decomposed) must come back untouched.
        let passthrough = Normalizer::new()
            .unicode(UnicodeNormalizationForm::Nfkc)
            .unicode_normalization(None)
            .half_width_ascii(false)
            .half_width_katakana(false)
            .combine_dakuten(false)
            .decompose_dakuten(true)
            .punctuation(false)
            .brackets(false)
            .symbols(false)
            .old_kanji(false)
            .remove_variation_selectors(false)
            .expand_iteration_marks(false)
            .preserve_ascii_tokens(true)
            .whitespace(WhitespaceMode::Preserve);

        assert_eq!(passthrough.options().unicode, None);
        assert!(passthrough.options().decompose_dakuten);
        assert!(!passthrough.options().combine_dakuten);

        let sample = "舊字體，(カゝ) か\u{3099}";
        assert_eq!(passthrough.normalize(sample), "舊字體，(カゝ) か\u{3099}");
    }

    #[test]
    fn test_preserve_ascii_tokens() {
        let preserve = NormalizeOptions {
            preserve_ascii_tokens: true,
            ..NormalizeOptions::default()
        };

        for (input, expected) in [
            (
                "URL https://example.com/a,b と ＡＢＣ，",
                "URL https://example.com/a,b と ABC、",
            ),
            (
                "参照 (https://example.com/a,b), mail: user.name@example.com.",
                "参照 （https://example.com/a,b）、 mail: user.name@example.com。",
            ),
            ("価格 1,234.50，版 1.2.3.", "価格 1,234.50、版 1.2.3."),
            ("URL:https://example.com/a,b.", "URL:https://example.com/a,b。"),
            ("mail:user.name@example.com.", "mail:user.name@example.com。"),
        ] {
            assert_eq!(normalize_with_options(input, &preserve), expected);
        }
    }

    proptest! {
        #[test]
        fn prop_full_half_ascii_roundtrip(ascii in "[ -~]*") {
            let widened = to_full_width(&ascii);
            prop_assert_eq!(to_half_width(&widened), ascii);
        }

        #[test]
        fn prop_kana_roundtrip(hiragana in "[ぁ-ゖ]*") {
            let as_katakana = to_katakana(&hiragana);
            prop_assert_eq!(to_hiragana(&as_katakana), hiragana);
        }
    }
}
japanese_text/lib.rs

japanese_text/
lib.rs