1#![warn(missing_docs)]
37#![deny(unsafe_code)]
38
/// Number of initial consonants (choseong) a precomposed syllable can start with.
const CHOSEONG_COUNT: u32 = 19;

/// Number of medial vowels (jungseong) a precomposed syllable can contain.
const JUNGSEONG_COUNT: u32 = 21;

/// Number of final-consonant (jongseong) slots, counting the "no final" slot.
const JONGSEONG_COUNT: u32 = 28;

/// Code point of the first precomposed Hangul syllable (U+AC00).
const HANGUL_BASE: u32 = 0xAC00;

/// Code point of the last precomposed Hangul syllable (U+D7A3).
///
/// Computed from the three jamo counts (19 * 21 * 28 = 11 172 syllables) so the
/// range check and the index arithmetic can never disagree with each other.
const HANGUL_END: u32 = HANGUL_BASE + CHOSEONG_COUNT * JUNGSEONG_COUNT * JONGSEONG_COUNT - 1;
54
/// Initial consonants as compatibility jamo, ordered by their index inside the
/// precomposed-syllable arithmetic.
const CHOSEONG_LIST: [char; 19] = [
    'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', // 0..=4
    'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', // 5..=9
    'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', // 10..=14
    'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ', // 15..=18
];
60
/// Medial vowels as compatibility jamo, ordered by their index inside the
/// precomposed-syllable arithmetic.
const JUNGSEONG_LIST: [char; 21] = [
    'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', // 0..=6
    'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', // 7..=13
    'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ', // 14..=20
];
66
/// Final consonants as compatibility jamo, ordered by their index inside the
/// precomposed-syllable arithmetic.
///
/// Slot 0 is `None`: a syllable with no final consonant.
const JONGSEONG_LIST: [Option<char>; 28] = [
    // 0: open syllable
    None,
    // 1..=7
    Some('ㄱ'), Some('ㄲ'), Some('ㄳ'), Some('ㄴ'), Some('ㄵ'), Some('ㄶ'), Some('ㄷ'),
    // 8..=15
    Some('ㄹ'), Some('ㄺ'), Some('ㄻ'), Some('ㄼ'), Some('ㄽ'), Some('ㄾ'), Some('ㄿ'), Some('ㅀ'),
    // 16..=20
    Some('ㅁ'), Some('ㅂ'), Some('ㅄ'), Some('ㅅ'), Some('ㅆ'),
    // 21..=27
    Some('ㅇ'), Some('ㅈ'), Some('ㅊ'), Some('ㅋ'), Some('ㅌ'), Some('ㅍ'), Some('ㅎ'),
];
98
99#[inline]
120#[must_use]
121pub fn is_hangul_syllable(c: char) -> bool {
122 let code = c as u32;
123 (HANGUL_BASE..=HANGUL_END).contains(&code)
124}
125
126#[inline]
136#[must_use]
137pub fn is_hangul(c: char) -> bool {
138 is_hangul_syllable(c) || is_jamo(c)
139}
140
/// Returns `true` when `c` is a Hangul compatibility jamo in the inclusive
/// range U+3131..=U+3163.
#[inline]
#[must_use]
pub fn is_jamo(c: char) -> bool {
    matches!(c, '\u{3131}'..='\u{3163}')
}
151
152#[inline]
154#[must_use]
155pub fn is_choseong(c: char) -> bool {
156 CHOSEONG_LIST.contains(&c)
157}
158
159#[inline]
161#[must_use]
162pub fn is_jungseong(c: char) -> bool {
163 JUNGSEONG_LIST.contains(&c)
164}
165
166#[inline]
188#[must_use]
189pub fn has_jongseong(c: char) -> Option<bool> {
190 if !is_hangul_syllable(c) {
191 return None;
192 }
193 let code = c as u32 - HANGUL_BASE;
194 Some(code % JONGSEONG_COUNT != 0)
195}
196
197#[must_use]
220#[allow(clippy::similar_names)]
221pub fn decompose(c: char) -> Option<(char, char, Option<char>)> {
222 if !is_hangul_syllable(c) {
223 return None;
224 }
225
226 let code = c as u32 - HANGUL_BASE;
227
228 let jongseong_idx = code % JONGSEONG_COUNT;
229 let jungseong_idx = ((code - jongseong_idx) / JONGSEONG_COUNT) % JUNGSEONG_COUNT;
230 let choseong_idx = ((code - jongseong_idx) / JONGSEONG_COUNT) / JUNGSEONG_COUNT;
231
232 let cho = CHOSEONG_LIST[choseong_idx as usize];
233 let jung = JUNGSEONG_LIST[jungseong_idx as usize];
234 let jong = JONGSEONG_LIST[jongseong_idx as usize];
235
236 Some((cho, jung, jong))
237}
238
239#[must_use]
264#[allow(clippy::cast_possible_truncation, clippy::similar_names)]
265pub fn compose(choseong: char, jungseong: char, jongseong: Option<char>) -> Option<char> {
266 let choseong_idx = CHOSEONG_LIST.iter().position(|&c| c == choseong)? as u32;
267 let jungseong_idx = JUNGSEONG_LIST.iter().position(|&c| c == jungseong)? as u32;
268 let jongseong_idx = match jongseong {
269 None => 0,
270 Some(j) => JONGSEONG_LIST.iter().position(|&c| c == Some(j))? as u32,
271 };
272
273 let code = HANGUL_BASE
274 + (choseong_idx * JUNGSEONG_COUNT + jungseong_idx) * JONGSEONG_COUNT
275 + jongseong_idx;
276
277 char::from_u32(code)
278}
279
280#[must_use]
299pub fn decompose_str(s: &str) -> String {
300 let mut result = String::with_capacity(s.len() * 3);
301
302 for c in s.chars() {
303 if let Some((cho, jung, jong)) = decompose(c) {
304 result.push(cho);
305 result.push(jung);
306 if let Some(j) = jong {
307 result.push(j);
308 }
309 } else {
310 result.push(c);
311 }
312 }
313
314 result
315}
316
317#[must_use]
335pub fn compose_str(s: &str) -> String {
336 let chars: Vec<char> = s.chars().collect();
337 let mut result = String::with_capacity(s.len());
338 let mut i = 0;
339
340 while i < chars.len() {
341 if i + 1 < chars.len() && is_choseong(chars[i]) && is_jungseong(chars[i + 1]) {
343 let cho = chars[i];
344 let jung = chars[i + 1];
345
346 let jongseong = if i + 2 < chars.len() {
349 let potential_jongseong = chars[i + 2];
350 let is_potential_jongseong = JONGSEONG_LIST.contains(&Some(potential_jongseong));
351
352 if is_potential_jongseong {
353 if i + 3 < chars.len() && is_jungseong(chars[i + 3]) {
355 None
356 } else {
357 Some(potential_jongseong)
358 }
359 } else {
360 None
361 }
362 } else {
363 None
364 };
365
366 if let Some(c) = compose(cho, jung, jongseong) {
367 result.push(c);
368 i += if jongseong.is_some() { 3 } else { 2 };
369 } else {
370 result.push(chars[i]);
371 i += 1;
372 }
373 } else {
374 result.push(chars[i]);
375 i += 1;
376 }
377 }
378
379 result
380}
381
/// Coarse character category produced by [`classify_char`].
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum CharType {
    /// Precomposed Hangul syllable.
    HangulSyllable,
    /// Standalone Hangul compatibility jamo.
    HangulJamo,
    /// CJK ideograph (U+4E00..=U+9FFF or U+3400..=U+4DBF).
    Hanja,
    /// Character from the Katakana range U+30A0..=U+30FF.
    Katakana,
    /// Character from the Hiragana range U+3040..=U+309F.
    Hiragana,
    /// ASCII letter.
    Alphabet,
    /// ASCII decimal digit.
    Digit,
    /// Whitespace as defined by `char::is_whitespace`.
    Whitespace,
    /// ASCII punctuation.
    Punctuation,
    /// Anything that matches none of the other categories.
    Other,
}
406
407#[must_use]
417pub fn classify_char(c: char) -> CharType {
418 let code = c as u32;
419
420 if is_hangul_syllable(c) {
421 CharType::HangulSyllable
422 } else if is_jamo(c) {
423 CharType::HangulJamo
424 } else if (0x4E00..=0x9FFF).contains(&code) || (0x3400..=0x4DBF).contains(&code) {
425 CharType::Hanja
426 } else if (0x30A0..=0x30FF).contains(&code) {
427 CharType::Katakana
428 } else if (0x3040..=0x309F).contains(&code) {
429 CharType::Hiragana
430 } else if c.is_ascii_alphabetic() {
431 CharType::Alphabet
432 } else if c.is_ascii_digit() {
433 CharType::Digit
434 } else if c.is_whitespace() {
435 CharType::Whitespace
436 } else if c.is_ascii_punctuation() {
437 CharType::Punctuation
438 } else {
439 CharType::Other
440 }
441}
442
/// Unit tests for the jamo predicates, syllable (de)composition and
/// character classification.
#[cfg(test)]
// Tests unwrap freely and use the naturally similar cho/jung/jong names.
#[allow(clippy::similar_names, clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    /// Every initial consonant, spelled out independently of the table under test.
    const ALL_CHOSEONG: &str = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ";
    /// Every medial vowel, spelled out independently of the table under test.
    const ALL_JUNGSEONG: &str = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ";
    /// Every real final consonant, spelled out independently of the table under test.
    const ALL_JONGSEONG: &str = "ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ";

    #[test]
    fn test_is_hangul_syllable() {
        for c in ['가', '힣', '한'] {
            assert!(is_hangul_syllable(c), "'{c}' should be a syllable");
        }
        for c in ['ㄱ', 'a', 'あ'] {
            assert!(!is_hangul_syllable(c), "'{c}' should not be a syllable");
        }
    }

    #[test]
    fn test_is_hangul() {
        for c in ['가', 'ㄱ', 'ㅏ'] {
            assert!(is_hangul(c), "'{c}' should be Hangul");
        }
        assert!(!is_hangul('a'));
    }

    #[test]
    fn test_has_jongseong() {
        let cases = [
            ('한', Some(true)),
            ('하', Some(false)),
            ('글', Some(true)),
            ('가', Some(false)),
            ('a', None),
        ];
        for (c, expected) in cases {
            assert_eq!(has_jongseong(c), expected, "has_jongseong('{c}')");
        }
    }

    #[test]
    fn test_decompose() {
        let cases = [
            ('가', Some(('ㄱ', 'ㅏ', None))),
            ('한', Some(('ㅎ', 'ㅏ', Some('ㄴ')))),
            ('글', Some(('ㄱ', 'ㅡ', Some('ㄹ')))),
            ('힣', Some(('ㅎ', 'ㅣ', Some('ㅎ')))),
            ('a', None),
        ];
        for (c, expected) in cases {
            assert_eq!(decompose(c), expected, "decompose('{c}')");
        }
    }

    #[test]
    fn test_compose() {
        let cases = [
            (('ㄱ', 'ㅏ', None), '가'),
            (('ㅎ', 'ㅏ', Some('ㄴ')), '한'),
            (('ㄱ', 'ㅡ', Some('ㄹ')), '글'),
            (('ㅎ', 'ㅣ', Some('ㅎ')), '힣'),
        ];
        for ((cho, jung, jong), expected) in cases {
            assert_eq!(compose(cho, jung, jong), Some(expected));
        }
    }

    #[test]
    fn test_decompose_compose_roundtrip() {
        for c in ['가', '나', '다', '한', '글', '힣', '뷁'] {
            let (cho, jung, jong) = decompose(c).unwrap();
            let result = compose(cho, jung, jong).unwrap();
            assert_eq!(c, result, "Roundtrip failed for '{c}'");
        }
    }

    #[test]
    fn test_decompose_str() {
        let cases = [
            ("한글", "ㅎㅏㄴㄱㅡㄹ"),
            ("가나다", "ㄱㅏㄴㅏㄷㅏ"),
            ("Hello 한글", "Hello ㅎㅏㄴㄱㅡㄹ"),
        ];
        for (input, expected) in cases {
            assert_eq!(decompose_str(input), expected);
        }
    }

    #[test]
    fn test_compose_str() {
        assert_eq!(compose_str("ㅎㅏㄴㄱㅡㄹ"), "한글");
        assert_eq!(compose_str("ㄱㅏㄴㅏㄷㅏ"), "가나다");
    }

    #[test]
    fn test_classify_char() {
        let cases = [
            ('한', CharType::HangulSyllable),
            ('ㄱ', CharType::HangulJamo),
            ('韓', CharType::Hanja),
            ('ア', CharType::Katakana),
            ('あ', CharType::Hiragana),
            ('a', CharType::Alphabet),
            ('1', CharType::Digit),
            (' ', CharType::Whitespace),
            ('.', CharType::Punctuation),
        ];
        for (c, expected) in cases {
            assert_eq!(classify_char(c), expected, "classify_char('{c}')");
        }
    }

    #[test]
    fn test_is_jamo() {
        // Consonants, vowels and compound finals are all compatibility jamo.
        for c in ['ㄱ', 'ㄲ', 'ㅎ', 'ㅏ', 'ㅐ', 'ㅣ', 'ㄳ', 'ㄵ', 'ㅄ'] {
            assert!(is_jamo(c), "'{c}' should be a jamo");
        }
        for c in ['가', 'a', '1', ' '] {
            assert!(!is_jamo(c), "'{c}' should not be a jamo");
        }
    }

    #[test]
    fn test_is_choseong() {
        for c in ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㅎ'] {
            assert!(is_choseong(c), "'{c}' should be a choseong");
        }
        // A vowel, a compound final, a full syllable and a Latin letter.
        for c in ['ㅏ', 'ㄳ', '가', 'a'] {
            assert!(!is_choseong(c), "'{c}' should not be a choseong");
        }
    }

    #[test]
    fn test_is_jungseong() {
        for c in ['ㅏ', 'ㅐ', 'ㅘ', 'ㅣ'] {
            assert!(is_jungseong(c), "'{c}' should be a jungseong");
        }
        // A consonant, a compound final, a full syllable and a Latin letter.
        for c in ['ㄱ', 'ㄳ', '가', 'a'] {
            assert!(!is_jungseong(c), "'{c}' should not be a jungseong");
        }
    }

    #[test]
    fn test_boundary_hangul_syllables() {
        assert!(is_hangul_syllable('가'));
        assert_eq!(decompose('가'), Some(('ㄱ', 'ㅏ', None)));

        assert!(is_hangul_syllable('힣'));
        assert_eq!(decompose('힣'), Some(('ㅎ', 'ㅣ', Some('ㅎ'))));

        let before_first = char::from_u32(0xAC00 - 1).unwrap();
        assert!(!is_hangul_syllable(before_first));

        let after_last = char::from_u32(0xD7A3 + 1).unwrap();
        assert!(!is_hangul_syllable(after_last));
    }

    #[test]
    fn test_compose_invalid_jamo() {
        let rejected = [
            // Vowel in the initial slot.
            ('ㅏ', 'ㅏ', None),
            // Consonant in the medial slot.
            ('ㄱ', 'ㄱ', None),
            // Vowel in the final slot.
            ('ㄱ', 'ㅏ', Some('ㅏ')),
            // Non-Hangul characters in each slot.
            ('a', 'ㅏ', None),
            ('ㄱ', 'b', None),
            ('ㄱ', 'ㅏ', Some('c')),
        ];
        for (cho, jung, jong) in rejected {
            assert_eq!(compose(cho, jung, jong), None);
        }
    }

    #[test]
    fn test_decompose_all_choseong() {
        assert_eq!(ALL_CHOSEONG.chars().count(), 19);
        for cho in ALL_CHOSEONG.chars() {
            let syllable = compose(cho, 'ㅏ', None).unwrap();
            let (decomposed_cho, _, _) = decompose(syllable).unwrap();
            assert_eq!(cho, decomposed_cho, "Choseong mismatch for {syllable}");
        }
    }

    #[test]
    fn test_decompose_all_jungseong() {
        assert_eq!(ALL_JUNGSEONG.chars().count(), 21);
        for jung in ALL_JUNGSEONG.chars() {
            let syllable = compose('ㄱ', jung, None).unwrap();
            let (_, decomposed_jung, _) = decompose(syllable).unwrap();
            assert_eq!(jung, decomposed_jung, "Jungseong mismatch for {syllable}");
        }
    }

    #[test]
    fn test_decompose_all_jongseong() {
        assert_eq!(ALL_JONGSEONG.chars().count(), 27);
        for jong in ALL_JONGSEONG.chars().map(Some) {
            let syllable = compose('ㄱ', 'ㅏ', jong).unwrap();
            let (_, _, decomposed_jong) = decompose(syllable).unwrap();
            assert_eq!(jong, decomposed_jong, "Jongseong mismatch for {syllable}");
        }
    }

    #[test]
    fn test_complex_jongseong() {
        for (jong, syllable) in [('ㄳ', '갃'), ('ㄵ', '갅'), ('ㅄ', '값')] {
            assert_eq!(compose('ㄱ', 'ㅏ', Some(jong)), Some(syllable));
            assert_eq!(decompose(syllable), Some(('ㄱ', 'ㅏ', Some(jong))));
        }
    }

    #[test]
    fn test_decompose_str_empty() {
        assert_eq!(decompose_str(""), "");
    }

    #[test]
    fn test_decompose_str_no_hangul() {
        for s in ["Hello World", "123", "!@#$%"] {
            assert_eq!(decompose_str(s), s);
        }
    }

    #[test]
    fn test_decompose_str_mixed() {
        let cases = [
            ("안녕하세요", "ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ"),
            ("Hello 안녕", "Hello ㅇㅏㄴㄴㅕㅇ"),
            ("한글123", "ㅎㅏㄴㄱㅡㄹ123"),
        ];
        for (input, expected) in cases {
            assert_eq!(decompose_str(input), expected);
        }
    }

    #[test]
    fn test_compose_str_empty() {
        assert_eq!(compose_str(""), "");
    }

    #[test]
    fn test_compose_str_no_jamo() {
        for s in ["Hello", "123"] {
            assert_eq!(compose_str(s), s);
        }
    }

    #[test]
    fn test_compose_str_incomplete_jamo() {
        // A lone consonant, a lone vowel and two consonants cannot form a syllable.
        for s in ["ㄱ", "ㅏ", "ㄱㄴ"] {
            assert_eq!(compose_str(s), s);
        }
    }

    #[test]
    fn test_compose_str_with_jongseong_ambiguity() {
        // The first of two consonants closes the syllable, the second opens the next.
        assert_eq!(compose_str("ㄱㅏㄴㄴㅕㅇ"), "간녕");
        assert_eq!(compose_str("ㄱㅏㄹㄱㅏ"), "갈가");
    }

    #[test]
    fn test_decompose_compose_str_roundtrip() {
        for s in ["한글", "안녕하세요", "대한민국", "가나다라마바사"] {
            let decomposed = decompose_str(s);
            let recomposed = compose_str(&decomposed);
            assert_eq!(s, recomposed, "Roundtrip failed for '{s}'");
        }
    }

    #[test]
    fn test_has_jongseong_all_combinations() {
        // Open syllables.
        for c in ['가', '나', '다'] {
            assert_eq!(has_jongseong(c), Some(false), "has_jongseong('{c}')");
        }
        // Single and compound final consonants.
        for c in ['각', '난', '닭', '갃', '값'] {
            assert_eq!(has_jongseong(c), Some(true), "has_jongseong('{c}')");
        }
    }

    #[test]
    fn test_classify_char_edge_cases() {
        let cases = [
            // First and last code point of each ideograph range.
            ('\u{3400}', CharType::Hanja),
            ('\u{4DBF}', CharType::Hanja),
            ('\u{4E00}', CharType::Hanja),
            ('\u{9FFF}', CharType::Hanja),
            // Kana range boundaries.
            ('\u{30A0}', CharType::Katakana),
            ('\u{30FF}', CharType::Katakana),
            ('\u{3040}', CharType::Hiragana),
            ('\u{309F}', CharType::Hiragana),
            ('!', CharType::Punctuation),
            (',', CharType::Punctuation),
            (';', CharType::Punctuation),
            ('\t', CharType::Whitespace),
            ('\n', CharType::Whitespace),
            // Non-ASCII symbols fall through to `Other`.
            ('™', CharType::Other),
            ('€', CharType::Other),
        ];
        for (c, expected) in cases {
            assert_eq!(classify_char(c), expected, "classify_char(U+{:04X})", c as u32);
        }
    }

    #[test]
    fn test_all_hangul_syllables_decompose_compose() {
        // The whole block is only 11 172 code points, so check every one of them.
        for code in HANGUL_BASE..=HANGUL_END {
            let c = char::from_u32(code).unwrap();
            let (cho, jung, jong) = decompose(c).unwrap();
            let recomposed = compose(cho, jung, jong).unwrap();
            assert_eq!(c, recomposed, "Failed for U+{code:04X} ({c})");
            assert_eq!(has_jongseong(c), Some(jong.is_some()), "U+{code:04X} ({c})");
        }
    }

    #[test]
    fn test_compose_str_real_words() {
        let cases = [
            ("ㅎㅏㄴㄱㅡㄹ", "한글"),
            ("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ", "안녕하세요"),
            ("ㄷㅐㅎㅏㄴㅁㅣㄴㄱㅜㄱ", "대한민국"),
        ];
        for (input, expected) in cases {
            assert_eq!(compose_str(input), expected);
        }
    }
}