/// Converts full-width ASCII variants in `input` to their half-width (plain ASCII) forms.
///
/// * U+FF01..=U+FF5E (the full-width forms of `!` through `~`) are shifted down
///   to U+0021..=U+007E.
/// * U+3000 (ideographic space) becomes an ASCII space.
/// * Every other character is copied through unchanged.
///
/// Returns a newly allocated `String`; `input` is not modified.
pub fn to_half_width(input: &str) -> String {
    input
        .chars()
        .map(|c| match c {
            // Spelled as an escape on purpose: a literal ideographic space looks
            // exactly like an ASCII space and is easily folded into one by
            // editors or Unicode normalization, which silently turns this arm
            // into a no-op.
            '\u{3000}' => ' ',
            // Both ranges have the same length, so the shifted value is always a
            // valid scalar; `unwrap_or` is only a defensive fallback.
            '\u{FF01}'..='\u{FF5E}' => char::from_u32(c as u32 - 0xFF01 + 0x0021).unwrap_or(c),
            _ => c,
        })
        .collect()
}
61
/// Converts printable ASCII in `input` to its full-width forms.
///
/// * U+0021..=U+007E (`!` through `~`) are shifted up to U+FF01..=U+FF5E.
/// * An ASCII space becomes U+3000 (ideographic space).
/// * Every other character is copied through unchanged.
///
/// Returns a newly allocated `String`; `input` is not modified.
pub fn to_full_width(input: &str) -> String {
    input
        .chars()
        .map(|c| match c {
            // The target is written as an escape: a literal ideographic space
            // is indistinguishable from an ASCII space in source and, once
            // folded to one, makes this arm map a space to itself.
            ' ' => '\u{3000}',
            // Both ranges have the same length, so the shifted value is always a
            // valid scalar; `unwrap_or` is only a defensive fallback.
            '\u{0021}'..='\u{007E}' => char::from_u32(c as u32 - 0x0021 + 0xFF01).unwrap_or(c),
            _ => c,
        })
        .collect()
}
94
/// Returns a copy of `input` in which katakana letters are replaced by hiragana.
///
/// Only code points in U+30A1..=U+30F6 are rewritten; each one is moved down by
/// the fixed distance between U+30A1 and U+3041. Characters outside that range
/// (for example the prolonged sound mark U+30FC, ASCII, or kanji) are copied
/// unchanged.
pub fn to_hiragana(input: &str) -> String {
    // Distance between a katakana code point and its hiragana counterpart.
    const BLOCK_GAP: u32 = 0x30A1 - 0x3041;

    let mut converted = String::with_capacity(input.len());
    for ch in input.chars() {
        if ('\u{30A1}'..='\u{30F6}').contains(&ch) {
            // The shifted value always lands inside the hiragana block, so the
            // fallback to `ch` is purely defensive.
            converted.push(char::from_u32(ch as u32 - BLOCK_GAP).unwrap_or(ch));
        } else {
            converted.push(ch);
        }
    }
    converted
}
124
/// Returns a copy of `input` in which hiragana letters are replaced by katakana.
///
/// Only code points in U+3041..=U+3096 are rewritten; each one is moved up by
/// the fixed distance between U+3041 and U+30A1. All other characters are
/// copied unchanged.
pub fn to_katakana(input: &str) -> String {
    // Distance between a hiragana code point and its katakana counterpart.
    const BLOCK_GAP: u32 = 0x30A1 - 0x3041;

    let shift = |ch: char| -> char {
        if ('\u{3041}'..='\u{3096}').contains(&ch) {
            // The shifted value always lands inside the katakana block, so the
            // fallback to `ch` is purely defensive.
            char::from_u32(ch as u32 + BLOCK_GAP).unwrap_or(ch)
        } else {
            ch
        }
    };

    let mut converted = String::with_capacity(input.len());
    converted.extend(input.chars().map(shift));
    converted
}
154
/// Reports whether `c` is a hiragana letter, i.e. lies in U+3041..=U+3096.
///
/// Code points of the Hiragana block outside that range (such as the iteration
/// marks U+309D and U+309E) are not counted.
pub fn is_hiragana(c: char) -> bool {
    ('\u{3041}'..='\u{3096}').contains(&c)
}
169
/// Reports whether `c` is a full-width katakana letter, i.e. lies in
/// U+30A1..=U+30F6.
///
/// The prolonged sound mark U+30FC and the iteration marks U+30FD and U+30FE
/// fall outside that range and yield `false`.
pub fn is_katakana(c: char) -> bool {
    ('\u{30A1}'..='\u{30F6}').contains(&c)
}
184
/// Reports whether `c` lies in U+FF61..=U+FF9F, the half-width katakana range.
///
/// That range also contains the half-width punctuation (U+FF61..=U+FF65) and
/// the half-width voicing marks (U+FF9E, U+FF9F), so those return `true` too.
pub fn is_half_width_katakana(c: char) -> bool {
    ('\u{FF61}'..='\u{FF9F}').contains(&c)
}
199
/// Reports whether `c` lies in U+4E00..=U+9FFF.
///
/// Only this single range is tested; ideographs encoded anywhere else yield
/// `false`.
pub fn is_kanji(c: char) -> bool {
    let code_point = c as u32;
    (0x4E00..=0x9FFF).contains(&code_point)
}
215
/// Reports whether `c` is a full-width ASCII variant (U+FF01..=U+FF5E) or the
/// ideographic space U+3000.
///
/// Plain ASCII characters, including the ASCII space, return `false`.
pub fn is_full_width(c: char) -> bool {
    // U+3000 is written as an escape: a literal ideographic space is visually
    // identical to an ASCII space, and if it is ever folded into one this
    // predicate starts accepting ordinary spaces.
    matches!(c, '\u{FF01}'..='\u{FF5E}' | '\u{3000}')
}
230
/// Per-category character tallies, as returned by `count_character_types`.
///
/// `Default` yields a value with every counter set to zero.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CharacterTypes {
    /// Number of characters classified as hiragana.
    pub hiragana: usize,
    /// Number of characters classified as (full-width) katakana.
    pub katakana: usize,
    /// Number of characters classified as half-width katakana.
    pub half_width_katakana: usize,
    /// Number of characters classified as kanji.
    pub kanji: usize,
    /// Number of ASCII characters.
    pub ascii: usize,
    /// Number of characters classified as full-width forms.
    pub full_width: usize,
    /// Number of characters that matched none of the other categories.
    pub other: usize,
}
254
255pub fn count_character_types(input: &str) -> CharacterTypes {
256 let mut counts = CharacterTypes {
257 hiragana: 0,
258 katakana: 0,
259 half_width_katakana: 0,
260 kanji: 0,
261 ascii: 0,
262 full_width: 0,
263 other: 0,
264 };
265
266 for c in input.chars() {
267 if is_hiragana(c) {
268 counts.hiragana += 1;
269 } else if is_katakana(c) {
270 counts.katakana += 1;
271 } else if is_half_width_katakana(c) {
272 counts.half_width_katakana += 1;
273 } else if is_kanji(c) {
274 counts.kanji += 1;
275 } else if c.is_ascii() {
276 counts.ascii += 1;
277 } else if is_full_width(c) {
278 counts.full_width += 1;
279 } else {
280 counts.other += 1;
281 }
282 }
283
284 counts
285}
286
/// Collapses every run of whitespace in `input` into a single ASCII space and
/// strips leading and trailing whitespace.
///
/// "Whitespace" is the Unicode `White_Space` property, so tabs, newlines and
/// the ideographic space U+3000 are all treated as separators. An input that
/// is empty or consists only of whitespace yields an empty string.
pub fn normalize_whitespace(input: &str) -> String {
    // `split_whitespace` already splits on exactly the characters for which
    // `char::is_whitespace` is true (U+3000 included), so no preliminary pass
    // that rewrites them to ' ' -- and no intermediate `String` -- is needed.
    input.split_whitespace().collect::<Vec<_>>().join(" ")
}
312
/// Converts half-width katakana and half-width Japanese punctuation in `input`
/// to their full-width forms.
///
/// * U+FF61..=U+FF9D are replaced by the corresponding full-width character.
/// * A kana followed by a voicing mark is folded into the single precomposed
///   voiced character when one is handled here: the ka, sa, ta and ha rows plus
///   U+30A6 (-> U+30F4) for the dakuten, and the ha row for the handakuten.
///   Both the half-width marks (U+FF9E, U+FF9F) and the combining marks
///   (U+3099, U+309A) are recognised, after either a half-width or a
///   full-width base.
/// * Everything else, including a voicing mark that follows a character it
///   cannot combine with, is copied through unchanged.
pub fn half_width_katakana_to_full_width(input: &str) -> String {
    // Every replacement is at most as long (in UTF-8) as what it replaces.
    let mut result = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();

    while let Some(c) = chars.next() {
        let base = widen_half_width_kana(c);
        let composed = chars
            .peek()
            .and_then(|&mark| apply_sound_mark(base, mark));

        if let Some(voiced) = composed {
            result.push(voiced);
            // The mark has been merged into `voiced`; skip it.
            chars.next();
        } else {
            result.push(base);
        }
    }

    result
}

/// Maps a half-width code point in U+FF61..=U+FF9D to its full-width form;
/// returns any other character unchanged.
fn widen_half_width_kana(c: char) -> char {
    // The half-width side is addressed by code point rather than by literal so
    // that the table cannot degrade into an identity map if the source text is
    // ever normalized. Entry `i` is the full-width form of U+FF61 + i.
    const FIRST_HALF_WIDTH: u32 = 0xFF61;
    const FULL_WIDTH: &[char] = &[
        '。', '「', '」', '、', '・', // U+FF61..=U+FF65
        'ヲ', 'ァ', 'ィ', 'ゥ', 'ェ', 'ォ', 'ャ', 'ュ', 'ョ', 'ッ', 'ー', // U+FF66..=U+FF70
        'ア', 'イ', 'ウ', 'エ', 'オ', // U+FF71..=U+FF75
        'カ', 'キ', 'ク', 'ケ', 'コ', // U+FF76..=U+FF7A
        'サ', 'シ', 'ス', 'セ', 'ソ', // U+FF7B..=U+FF7F
        'タ', 'チ', 'ツ', 'テ', 'ト', // U+FF80..=U+FF84
        'ナ', 'ニ', 'ヌ', 'ネ', 'ノ', // U+FF85..=U+FF89
        'ハ', 'ヒ', 'フ', 'ヘ', 'ホ', // U+FF8A..=U+FF8E
        'マ', 'ミ', 'ム', 'メ', 'モ', // U+FF8F..=U+FF93
        'ヤ', 'ユ', 'ヨ', // U+FF94..=U+FF96
        'ラ', 'リ', 'ル', 'レ', 'ロ', // U+FF97..=U+FF9B
        'ワ', 'ン', // U+FF9C..=U+FF9D
    ];

    (c as u32)
        .checked_sub(FIRST_HALF_WIDTH)
        .and_then(|offset| FULL_WIDTH.get(offset as usize))
        .copied()
        .unwrap_or(c)
}

/// Combines a full-width katakana `base` with a following voicing `mark`.
///
/// Returns `None` when `mark` is not a voicing mark or `base` has no voiced
/// form handled here, in which case the caller leaves both characters alone.
fn apply_sound_mark(base: char, mark: char) -> Option<char> {
    let is_dakuten = matches!(mark, '\u{FF9E}' | '\u{3099}');
    let is_handakuten = matches!(mark, '\u{FF9F}' | '\u{309A}');

    match base {
        // U+30A6 + dakuten is the one voiced form that is not adjacent to its base.
        'ウ' if is_dakuten => Some('\u{30F4}'),
        // In the katakana block each of these is immediately followed by its
        // dakuten form.
        'カ' | 'キ' | 'ク' | 'ケ' | 'コ' | 'サ' | 'シ' | 'ス' | 'セ' | 'ソ' | 'タ' | 'チ'
        | 'ツ' | 'テ' | 'ト' | 'ハ' | 'ヒ' | 'フ' | 'ヘ' | 'ホ'
            if is_dakuten =>
        {
            char::from_u32(base as u32 + 1)
        }
        // ...and, for the ha row, the handakuten form comes right after that.
        'ハ' | 'ヒ' | 'フ' | 'ヘ' | 'ホ' if is_handakuten => char::from_u32(base as u32 + 2),
        _ => None,
    }
}
398
/// Replaces tilde-like characters in `input` with the katakana prolonged sound
/// mark U+30FC.
///
/// The characters rewritten are the wave dash U+301C, the full-width tilde
/// U+FF5E and the ASCII tilde `~`. Every other character is copied unchanged.
pub fn normalize_prolonged_sound(input: &str) -> String {
    input
        .chars()
        .map(|c| match c {
            // Code points are spelled out because the wave dash and the
            // full-width tilde look alike and the latter normalizes to the
            // ASCII tilde; escapes keep all three variants distinct.
            '\u{301C}' | '\u{FF5E}' | '~' => '\u{30FC}',
            _ => c,
        })
        .collect()
}
418
419pub fn expand_iteration_marks(input: &str) -> String {
432 let chars: Vec<char> = input.chars().collect();
433 let mut result = String::new();
434
435 for (i, &c) in chars.iter().enumerate() {
436 match c {
437 'ゝ' => {
439 if i > 0 {
440 result.push(chars[i - 1]);
441 } else {
442 result.push(c);
443 }
444 }
445 'ゞ' => {
447 if i > 0 {
448 let prev = chars[i - 1];
449 let voiced = add_dakuten(prev);
450 result.push(voiced);
451 } else {
452 result.push(c);
453 }
454 }
455 'ヽ' => {
457 if i > 0 {
458 result.push(chars[i - 1]);
459 } else {
460 result.push(c);
461 }
462 }
463 'ヾ' => {
465 if i > 0 {
466 let prev = chars[i - 1];
467 let voiced = add_dakuten(prev);
468 result.push(voiced);
469 } else {
470 result.push(c);
471 }
472 }
473 _ => result.push(c),
474 }
475 }
476
477 result
478}
479
/// Returns the dakuten (voiced) form of `c` for kana in the ka, sa, ta and ha
/// rows, in either hiragana or katakana; any other character is returned
/// unchanged.
fn add_dakuten(c: char) -> char {
    // Every kana listed here is immediately followed, in its Unicode block, by
    // its voiced form, so voicing is a one-step code point increment.
    const VOICEABLE: &str = "かきくけこさしすせそたちつてとはひふへほ\
                             カキクケコサシスセソタチツテトハヒフヘホ";

    if VOICEABLE.contains(c) {
        char::from_u32(c as u32 + 1).unwrap_or(c)
    } else {
        c
    }
}
496
#[cfg(test)]
mod tests {
    use super::*;

    // Full-width ASCII, half-width katakana and the ideographic space are
    // written as `\u{...}` escapes throughout. As literals they are either
    // invisible (U+3000 vs ' ') or fold to a different character under Unicode
    // normalization, which turns the assertions below into tautologies or
    // contradictions.

    /// Full-width "ABC".
    const FW_ABC: &str = "\u{FF21}\u{FF22}\u{FF23}";
    /// Full-width "123".
    const FW_123: &str = "\u{FF11}\u{FF12}\u{FF13}";
    /// Full-width "!@#".
    const FW_SYMBOLS: &str = "\u{FF01}\u{FF20}\u{FF03}";
    /// Full-width "Hello", ideographic space, full-width "World".
    const FW_HELLO_WORLD: &str =
        "\u{FF28}\u{FF45}\u{FF4C}\u{FF4C}\u{FF4F}\u{3000}\u{FF37}\u{FF4F}\u{FF52}\u{FF4C}\u{FF44}";
    /// U+3000 as a one-character string.
    const IDEOGRAPHIC_SPACE: &str = "\u{3000}";

    #[test]
    fn test_to_half_width() {
        assert_eq!(to_half_width(FW_ABC), "ABC");
        assert_eq!(to_half_width(FW_123), "123");
        assert_eq!(to_half_width(FW_SYMBOLS), "!@#");
        assert_eq!(to_half_width(IDEOGRAPHIC_SPACE), " ");
        assert_eq!(to_half_width(FW_HELLO_WORLD), "Hello World");
        assert_eq!(
            to_half_width("\u{FF21}\u{FF22}\u{FF23}あいう"),
            "ABCあいう"
        );
    }

    #[test]
    fn test_to_full_width() {
        assert_eq!(to_full_width("ABC"), FW_ABC);
        assert_eq!(to_full_width("123"), FW_123);
        assert_eq!(to_full_width("!@#"), FW_SYMBOLS);
        assert_eq!(to_full_width(" "), IDEOGRAPHIC_SPACE);
        assert_eq!(to_full_width("Hello World"), FW_HELLO_WORLD);
        assert_eq!(
            to_full_width("ABCあいう"),
            "\u{FF21}\u{FF22}\u{FF23}あいう"
        );
    }

    #[test]
    fn test_to_hiragana() {
        assert_eq!(to_hiragana("カタカナ"), "かたかな");
        assert_eq!(to_hiragana("コンニチハ"), "こんにちは");
        assert_eq!(to_hiragana("アイウエオ"), "あいうえお");
        // U+30F4 (katakana vu) maps to U+3094 (hiragana vu).
        assert_eq!(
            to_hiragana("\u{30F4}ァイオリン"),
            "\u{3094}ぁいおりん"
        );
        assert_eq!(to_hiragana("カタカナABC"), "かたかなABC");
    }

    #[test]
    fn test_to_katakana() {
        assert_eq!(to_katakana("ひらがな"), "ヒラガナ");
        assert_eq!(to_katakana("こんにちは"), "コンニチハ");
        assert_eq!(to_katakana("あいうえお"), "アイウエオ");
        // U+3094 (hiragana vu) maps to U+30F4 (katakana vu).
        assert_eq!(
            to_katakana("\u{3094}ぁいおりん"),
            "\u{30F4}ァイオリン"
        );
        assert_eq!(to_katakana("ひらがなABC"), "ヒラガナABC");
    }

    #[test]
    fn test_roundtrip_full_half_width() {
        let original = "ABC123!@#";
        let full = to_full_width(original);
        // Guard against a vacuous round trip where nothing was converted.
        assert_ne!(full, original);
        let back = to_half_width(&full);
        assert_eq!(original, back);
    }

    #[test]
    fn test_roundtrip_hiragana_katakana() {
        let original = "こんにちは";
        let katakana = to_katakana(original);
        assert_ne!(katakana, original);
        let back = to_hiragana(&katakana);
        assert_eq!(original, back);
    }

    #[test]
    fn test_empty_string() {
        assert_eq!(to_half_width(""), "");
        assert_eq!(to_full_width(""), "");
        assert_eq!(to_hiragana(""), "");
        assert_eq!(to_katakana(""), "");
    }

    #[test]
    fn test_is_hiragana() {
        assert!(is_hiragana('あ'));
        assert!(is_hiragana('ん'));
        assert!(!is_hiragana('ア'));
        assert!(!is_hiragana('A'));
        assert!(!is_hiragana('漢'));
    }

    #[test]
    fn test_is_katakana() {
        assert!(is_katakana('ア'));
        assert!(is_katakana('ン'));
        assert!(!is_katakana('あ'));
        assert!(!is_katakana('A'));
    }

    #[test]
    fn test_is_half_width_katakana() {
        // Half-width A and N.
        assert!(is_half_width_katakana('\u{FF71}'));
        assert!(is_half_width_katakana('\u{FF9D}'));
        // Full-width A (U+30A2) is not half-width.
        assert!(!is_half_width_katakana('\u{30A2}'));
        assert!(!is_half_width_katakana('A'));
    }

    #[test]
    fn test_is_kanji() {
        assert!(is_kanji('漢'));
        assert!(is_kanji('字'));
        assert!(!is_kanji('あ'));
        assert!(!is_kanji('A'));
    }

    #[test]
    fn test_is_full_width() {
        // Full-width 'A', full-width '1', ideographic space.
        assert!(is_full_width('\u{FF21}'));
        assert!(is_full_width('\u{FF11}'));
        assert!(is_full_width('\u{3000}'));
        assert!(!is_full_width('A'));
    }

    #[test]
    fn test_count_character_types() {
        // One hiragana, one katakana, one kanji, six ASCII, three half-width kana.
        let counts = count_character_types("あア漢ABC123\u{FF71}\u{FF72}\u{FF73}");
        assert_eq!(counts.hiragana, 1);
        assert_eq!(counts.katakana, 1);
        assert_eq!(counts.kanji, 1);
        assert_eq!(counts.ascii, 6);
        assert_eq!(counts.half_width_katakana, 3);
    }

    #[test]
    fn test_normalize_whitespace() {
        assert_eq!(normalize_whitespace("Hello\u{3000}World"), "Hello World");
        assert_eq!(normalize_whitespace("A\t\t\tB"), "A B");
        assert_eq!(
            normalize_whitespace("  Multiple   Spaces  "),
            "Multiple Spaces"
        );
    }

    #[test]
    fn test_half_width_katakana_to_full_width() {
        // Half-width "katakana".
        assert_eq!(
            half_width_katakana_to_full_width("\u{FF76}\u{FF80}\u{FF76}\u{FF85}"),
            "カタカナ"
        );
        // Half-width ka-row kana, each followed by the half-width dakuten.
        assert_eq!(
            half_width_katakana_to_full_width(
                "\u{FF76}\u{FF9E}\u{FF77}\u{FF9E}\u{FF78}\u{FF9E}\u{FF79}\u{FF9E}\u{FF7A}\u{FF9E}"
            ),
            "\u{30AC}\u{30AE}\u{30B0}\u{30B2}\u{30B4}"
        );
        // Half-width ha-row kana, each followed by the half-width handakuten.
        assert_eq!(
            half_width_katakana_to_full_width(
                "\u{FF8A}\u{FF9F}\u{FF8B}\u{FF9F}\u{FF8C}\u{FF9F}\u{FF8D}\u{FF9F}\u{FF8E}\u{FF9F}"
            ),
            "\u{30D1}\u{30D4}\u{30D7}\u{30DA}\u{30DD}"
        );
        // Half-width "konnichiha".
        assert_eq!(
            half_width_katakana_to_full_width("\u{FF7A}\u{FF9D}\u{FF86}\u{FF81}\u{FF8A}"),
            "コンニチハ"
        );
    }

    #[test]
    fn test_normalize_prolonged_sound() {
        assert_eq!(normalize_prolonged_sound("コーヒー"), "コーヒー");
        // Wave dash U+301C in place of the prolonged sound mark.
        assert_eq!(
            normalize_prolonged_sound("コ\u{301C}ヒ\u{301C}"),
            "コーヒー"
        );
        assert_eq!(normalize_prolonged_sound("ラーメン"), "ラーメン");
    }

    #[test]
    fn test_expand_iteration_marks() {
        // U+309D / U+309E: hiragana iteration marks (plain / voiced).
        assert_eq!(expand_iteration_marks("いろ\u{309D}"), "いろろ");
        assert_eq!(expand_iteration_marks("か\u{309E}"), "か\u{304C}");
        // U+30FD / U+30FE: katakana iteration marks (plain / voiced).
        assert_eq!(expand_iteration_marks("ト\u{30FD}キ"), "トトキ");
        assert_eq!(expand_iteration_marks("カ\u{30FE}"), "カ\u{30AC}");
    }
}