Skip to main content

mecab_ko_hangul/
lib.rs

//! # mecab-ko-hangul
//!
//! 한글 자소(Jamo) 처리를 위한 유틸리티 라이브러리입니다.
//!
//! ## Features
//!
//! - 한글 자모 분리/결합
//! - 초/중/종성 추출
//! - 한글 판별 함수
//! - 종성 유무 판별
//!
//! ## Example
//!
//! ```rust
//! use mecab_ko_hangul::{decompose, compose, is_hangul, has_jongseong};
//!
//! // 자모 분리
//! let (cho, jung, jong) = decompose('한').unwrap();
//! assert_eq!(cho, 'ㅎ');
//! assert_eq!(jung, 'ㅏ');
//! assert_eq!(jong, Some('ㄴ'));
//!
//! // 자모 결합
//! let c = compose('ㅎ', 'ㅏ', Some('ㄴ')).unwrap();
//! assert_eq!(c, '한');
//!
//! // 한글 판별
//! assert!(is_hangul('가'));
//! assert!(!is_hangul('a'));
//!
//! // 종성 판별
//! assert_eq!(has_jongseong('한'), Some(true));
//! assert_eq!(has_jongseong('하'), Some(false));
//! ```
35
36#![warn(missing_docs)]
37#![deny(unsafe_code)]
38
/// First code point of the precomposed Hangul syllables (`가`, U+AC00).
const HANGUL_BASE: u32 = 0xAC00;

/// Last code point of the precomposed Hangul syllables (`힣`, U+D7A3).
const HANGUL_END: u32 = 0xD7A3;

/// Number of initial consonants (choseong).
const CHOSEONG_COUNT: u32 = 19;

/// Number of medial vowels (jungseong).
const JUNGSEONG_COUNT: u32 = 21;

/// Number of final-consonant slots (jongseong), including the "no final" slot.
const JONGSEONG_COUNT: u32 = 28;

// Compile-time guard: the syllable range must hold exactly one code point per
// (initial, medial, final) combination, otherwise the index arithmetic used by
// `decompose`/`compose` would be wrong.
const _: () =
    assert!(HANGUL_END - HANGUL_BASE + 1 == CHOSEONG_COUNT * JUNGSEONG_COUNT * JONGSEONG_COUNT);

/// Initial consonants as compatibility jamo, ordered by their syllable index.
///
/// The length is tied to [`CHOSEONG_COUNT`] so the table and the count cannot
/// drift apart.
const CHOSEONG_LIST: [char; CHOSEONG_COUNT as usize] = [
    'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ',
    'ㅌ', 'ㅍ', 'ㅎ',
];

/// Medial vowels as compatibility jamo, ordered by their syllable index.
///
/// The length is tied to [`JUNGSEONG_COUNT`].
const JUNGSEONG_LIST: [char; JUNGSEONG_COUNT as usize] = [
    'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ',
    'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ',
];

/// Final consonants as compatibility jamo, ordered by their syllable index.
///
/// Slot 0 is `None`, meaning the syllable has no final consonant. The length
/// is tied to [`JONGSEONG_COUNT`].
const JONGSEONG_LIST: [Option<char>; JONGSEONG_COUNT as usize] = [
    None,
    Some('ㄱ'),
    Some('ㄲ'),
    Some('ㄳ'),
    Some('ㄴ'),
    Some('ㄵ'),
    Some('ㄶ'),
    Some('ㄷ'),
    Some('ㄹ'),
    Some('ㄺ'),
    Some('ㄻ'),
    Some('ㄼ'),
    Some('ㄽ'),
    Some('ㄾ'),
    Some('ㄿ'),
    Some('ㅀ'),
    Some('ㅁ'),
    Some('ㅂ'),
    Some('ㅄ'),
    Some('ㅅ'),
    Some('ㅆ'),
    Some('ㅇ'),
    Some('ㅈ'),
    Some('ㅊ'),
    Some('ㅋ'),
    Some('ㅌ'),
    Some('ㅍ'),
    Some('ㅎ'),
];
98
99/// 주어진 문자가 한글 음절인지 확인합니다.
100///
101/// # Arguments
102///
103/// * `c` - 확인할 문자
104///
105/// # Returns
106///
107/// 한글 음절이면 `true`, 아니면 `false`
108///
109/// # Example
110///
111/// ```rust
112/// use mecab_ko_hangul::is_hangul_syllable;
113///
114/// assert!(is_hangul_syllable('가'));
115/// assert!(is_hangul_syllable('힣'));
116/// assert!(!is_hangul_syllable('ㄱ')); // 자모는 false
117/// assert!(!is_hangul_syllable('a'));
118/// ```
119#[inline]
120#[must_use]
121pub fn is_hangul_syllable(c: char) -> bool {
122    let code = c as u32;
123    (HANGUL_BASE..=HANGUL_END).contains(&code)
124}
125
126/// 주어진 문자가 한글(음절 또는 자모)인지 확인합니다.
127///
128/// # Arguments
129///
130/// * `c` - 확인할 문자
131///
132/// # Returns
133///
134/// 한글이면 `true`, 아니면 `false`
135#[inline]
136#[must_use]
137pub fn is_hangul(c: char) -> bool {
138    is_hangul_syllable(c) || is_jamo(c)
139}
140
/// Reports whether `c` is a Hangul compatibility jamo.
///
/// The accepted range runs from `ㄱ` (U+3131) through `ㅣ` (U+3163), which
/// covers the consonants, the compound finals and the vowels.
#[inline]
#[must_use]
pub fn is_jamo(c: char) -> bool {
    // Compatibility jamo: ㄱ (U+3131) ..= ㅣ (U+3163).
    matches!(c, '\u{3131}'..='\u{3163}')
}
151
152/// 주어진 문자가 초성 자모인지 확인합니다.
153#[inline]
154#[must_use]
155pub fn is_choseong(c: char) -> bool {
156    CHOSEONG_LIST.contains(&c)
157}
158
159/// 주어진 문자가 중성 자모인지 확인합니다.
160#[inline]
161#[must_use]
162pub fn is_jungseong(c: char) -> bool {
163    JUNGSEONG_LIST.contains(&c)
164}
165
166/// 주어진 한글 음절에 종성이 있는지 확인합니다.
167///
168/// # Arguments
169///
170/// * `c` - 확인할 한글 음절
171///
172/// # Returns
173///
174/// - `Some(true)`: 종성이 있음
175/// - `Some(false)`: 종성이 없음
176/// - `None`: 한글 음절이 아님
177///
178/// # Example
179///
180/// ```rust
181/// use mecab_ko_hangul::has_jongseong;
182///
183/// assert_eq!(has_jongseong('한'), Some(true));
184/// assert_eq!(has_jongseong('하'), Some(false));
185/// assert_eq!(has_jongseong('a'), None);
186/// ```
187#[inline]
188#[must_use]
189pub fn has_jongseong(c: char) -> Option<bool> {
190    if !is_hangul_syllable(c) {
191        return None;
192    }
193    let code = c as u32 - HANGUL_BASE;
194    Some(code % JONGSEONG_COUNT != 0)
195}
196
197/// 한글 음절을 초성, 중성, 종성으로 분해합니다.
198///
199/// # Arguments
200///
201/// * `c` - 분해할 한글 음절
202///
203/// # Returns
204///
205/// - `Some((초성, 중성, Option<종성>))`: 분해 성공
206/// - `None`: 한글 음절이 아님
207///
208/// # Example
209///
210/// ```rust
211/// use mecab_ko_hangul::decompose;
212///
213/// let result = decompose('한');
214/// assert_eq!(result, Some(('ㅎ', 'ㅏ', Some('ㄴ'))));
215///
216/// let result = decompose('가');
217/// assert_eq!(result, Some(('ㄱ', 'ㅏ', None)));
218/// ```
219#[must_use]
220#[allow(clippy::similar_names)]
221pub fn decompose(c: char) -> Option<(char, char, Option<char>)> {
222    if !is_hangul_syllable(c) {
223        return None;
224    }
225
226    let code = c as u32 - HANGUL_BASE;
227
228    let jongseong_idx = code % JONGSEONG_COUNT;
229    let jungseong_idx = ((code - jongseong_idx) / JONGSEONG_COUNT) % JUNGSEONG_COUNT;
230    let choseong_idx = ((code - jongseong_idx) / JONGSEONG_COUNT) / JUNGSEONG_COUNT;
231
232    let cho = CHOSEONG_LIST[choseong_idx as usize];
233    let jung = JUNGSEONG_LIST[jungseong_idx as usize];
234    let jong = JONGSEONG_LIST[jongseong_idx as usize];
235
236    Some((cho, jung, jong))
237}
238
239/// 초성, 중성, 종성을 결합하여 한글 음절을 만듭니다.
240///
241/// # Arguments
242///
243/// * `cho` - 초성 자모
244/// * `jung` - 중성 자모
245/// * `jong` - 종성 자모 (없으면 `None`)
246///
247/// # Returns
248///
249/// - `Some(한글 음절)`: 결합 성공
250/// - `None`: 잘못된 자모
251///
252/// # Example
253///
254/// ```rust
255/// use mecab_ko_hangul::compose;
256///
257/// let c = compose('ㅎ', 'ㅏ', Some('ㄴ'));
258/// assert_eq!(c, Some('한'));
259///
260/// let c = compose('ㄱ', 'ㅏ', None);
261/// assert_eq!(c, Some('가'));
262/// ```
263#[must_use]
264#[allow(clippy::cast_possible_truncation, clippy::similar_names)]
265pub fn compose(choseong: char, jungseong: char, jongseong: Option<char>) -> Option<char> {
266    let choseong_idx = CHOSEONG_LIST.iter().position(|&c| c == choseong)? as u32;
267    let jungseong_idx = JUNGSEONG_LIST.iter().position(|&c| c == jungseong)? as u32;
268    let jongseong_idx = match jongseong {
269        None => 0,
270        Some(j) => JONGSEONG_LIST.iter().position(|&c| c == Some(j))? as u32,
271    };
272
273    let code = HANGUL_BASE
274        + (choseong_idx * JUNGSEONG_COUNT + jungseong_idx) * JONGSEONG_COUNT
275        + jongseong_idx;
276
277    char::from_u32(code)
278}
279
280/// 문자열의 모든 한글 음절을 자모로 분해합니다.
281///
282/// # Arguments
283///
284/// * `s` - 입력 문자열
285///
286/// # Returns
287///
288/// 자모로 분해된 문자열. 한글이 아닌 문자는 그대로 유지됩니다.
289///
290/// # Example
291///
292/// ```rust
293/// use mecab_ko_hangul::decompose_str;
294///
295/// assert_eq!(decompose_str("한글"), "ㅎㅏㄴㄱㅡㄹ");
296/// assert_eq!(decompose_str("Hello 한글"), "Hello ㅎㅏㄴㄱㅡㄹ");
297/// ```
298#[must_use]
299pub fn decompose_str(s: &str) -> String {
300    let mut result = String::with_capacity(s.len() * 3);
301
302    for c in s.chars() {
303        if let Some((cho, jung, jong)) = decompose(c) {
304            result.push(cho);
305            result.push(jung);
306            if let Some(j) = jong {
307                result.push(j);
308            }
309        } else {
310            result.push(c);
311        }
312    }
313
314    result
315}
316
317/// 자모 문자열을 한글 음절로 결합합니다.
318///
319/// # Arguments
320///
321/// * `s` - 자모 문자열
322///
323/// # Returns
324///
325/// 결합된 문자열. 결합이 불가능한 자모는 그대로 유지됩니다.
326///
327/// # Example
328///
329/// ```rust
330/// use mecab_ko_hangul::compose_str;
331///
332/// assert_eq!(compose_str("ㅎㅏㄴㄱㅡㄹ"), "한글");
333/// ```
334#[must_use]
335pub fn compose_str(s: &str) -> String {
336    let chars: Vec<char> = s.chars().collect();
337    let mut result = String::with_capacity(s.len());
338    let mut i = 0;
339
340    while i < chars.len() {
341        // 초성 + 중성 + (종성) 패턴 시도
342        if i + 1 < chars.len() && is_choseong(chars[i]) && is_jungseong(chars[i + 1]) {
343            let cho = chars[i];
344            let jung = chars[i + 1];
345
346            // 다음 문자가 종성이 될 수 있는지 확인
347            // 단, 그 다음에 중성이 오면 종성이 아님
348            let jongseong = if i + 2 < chars.len() {
349                let potential_jongseong = chars[i + 2];
350                let is_potential_jongseong = JONGSEONG_LIST.contains(&Some(potential_jongseong));
351
352                if is_potential_jongseong {
353                    // 다음 다음 문자가 중성이면, 현재 문자는 다음 음절의 초성
354                    if i + 3 < chars.len() && is_jungseong(chars[i + 3]) {
355                        None
356                    } else {
357                        Some(potential_jongseong)
358                    }
359                } else {
360                    None
361                }
362            } else {
363                None
364            };
365
366            if let Some(c) = compose(cho, jung, jongseong) {
367                result.push(c);
368                i += if jongseong.is_some() { 3 } else { 2 };
369            } else {
370                result.push(chars[i]);
371                i += 1;
372            }
373        } else {
374            result.push(chars[i]);
375            i += 1;
376        }
377    }
378
379    result
380}
381
/// Coarse category assigned to a single character by `classify_char`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CharType {
    /// Precomposed Hangul syllable.
    HangulSyllable,
    /// Hangul jamo.
    HangulJamo,
    /// Hanja (CJK Unified Ideographs).
    Hanja,
    /// Katakana.
    Katakana,
    /// Hiragana.
    Hiragana,
    /// ASCII alphabetic letter.
    Alphabet,
    /// Digit.
    Digit,
    /// Whitespace character.
    Whitespace,
    /// Punctuation.
    Punctuation,
    /// Anything not covered by the other variants.
    Other,
}
406
407/// 문자의 종류를 판별합니다.
408///
409/// # Arguments
410///
411/// * `c` - 판별할 문자
412///
413/// # Returns
414///
415/// 문자의 종류를 나타내는 `CharType`
416#[must_use]
417pub fn classify_char(c: char) -> CharType {
418    let code = c as u32;
419
420    if is_hangul_syllable(c) {
421        CharType::HangulSyllable
422    } else if is_jamo(c) {
423        CharType::HangulJamo
424    } else if (0x4E00..=0x9FFF).contains(&code) || (0x3400..=0x4DBF).contains(&code) {
425        CharType::Hanja
426    } else if (0x30A0..=0x30FF).contains(&code) {
427        CharType::Katakana
428    } else if (0x3040..=0x309F).contains(&code) {
429        CharType::Hiragana
430    } else if c.is_ascii_alphabetic() {
431        CharType::Alphabet
432    } else if c.is_ascii_digit() {
433        CharType::Digit
434    } else if c.is_whitespace() {
435        CharType::Whitespace
436    } else if c.is_ascii_punctuation() {
437        CharType::Punctuation
438    } else {
439        CharType::Other
440    }
441}
442
/// Unit tests for the Hangul helpers in this file.
#[cfg(test)]
#[allow(clippy::similar_names, clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    /// Every initial consonant, written out independently of `CHOSEONG_LIST`
    /// so the tests do not merely compare the table with itself.
    const ALL_CHOSEONG: &str = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ";

    /// Every medial vowel, written out independently of `JUNGSEONG_LIST`.
    const ALL_JUNGSEONG: &str = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ";

    /// Every real final consonant (the "no final" slot is not included).
    const ALL_JONGSEONG: &str = "ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ";

    #[test]
    fn test_is_hangul_syllable() {
        assert!(is_hangul_syllable('가'));
        assert!(is_hangul_syllable('힣'));
        assert!(is_hangul_syllable('한'));
        assert!(!is_hangul_syllable('ㄱ'));
        assert!(!is_hangul_syllable('a'));
        assert!(!is_hangul_syllable('あ'));
    }

    #[test]
    fn test_is_hangul() {
        assert!(is_hangul('가'));
        assert!(is_hangul('ㄱ'));
        assert!(is_hangul('ㅏ'));
        assert!(!is_hangul('a'));
    }

    #[test]
    fn test_has_jongseong() {
        assert_eq!(has_jongseong('한'), Some(true));
        assert_eq!(has_jongseong('하'), Some(false));
        assert_eq!(has_jongseong('글'), Some(true));
        assert_eq!(has_jongseong('가'), Some(false));
        assert_eq!(has_jongseong('a'), None);
    }

    #[test]
    fn test_decompose() {
        assert_eq!(decompose('가'), Some(('ㄱ', 'ㅏ', None)));
        assert_eq!(decompose('한'), Some(('ㅎ', 'ㅏ', Some('ㄴ'))));
        assert_eq!(decompose('글'), Some(('ㄱ', 'ㅡ', Some('ㄹ'))));
        assert_eq!(decompose('힣'), Some(('ㅎ', 'ㅣ', Some('ㅎ'))));
        assert_eq!(decompose('a'), None);
    }

    #[test]
    fn test_compose() {
        assert_eq!(compose('ㄱ', 'ㅏ', None), Some('가'));
        assert_eq!(compose('ㅎ', 'ㅏ', Some('ㄴ')), Some('한'));
        assert_eq!(compose('ㄱ', 'ㅡ', Some('ㄹ')), Some('글'));
        assert_eq!(compose('ㅎ', 'ㅣ', Some('ㅎ')), Some('힣'));
    }

    #[test]
    fn test_decompose_compose_roundtrip() {
        let test_chars = ['가', '나', '다', '한', '글', '힣', '뷁'];
        for c in test_chars {
            let (cho, jung, jong) = decompose(c).unwrap();
            let result = compose(cho, jung, jong).unwrap();
            assert_eq!(c, result, "Roundtrip failed for '{c}'");
        }
    }

    #[test]
    fn test_decompose_str() {
        assert_eq!(decompose_str("한글"), "ㅎㅏㄴㄱㅡㄹ");
        assert_eq!(decompose_str("가나다"), "ㄱㅏㄴㅏㄷㅏ");
        assert_eq!(decompose_str("Hello 한글"), "Hello ㅎㅏㄴㄱㅡㄹ");
    }

    #[test]
    fn test_compose_str() {
        assert_eq!(compose_str("ㅎㅏㄴㄱㅡㄹ"), "한글");
        assert_eq!(compose_str("ㄱㅏㄴㅏㄷㅏ"), "가나다");
    }

    #[test]
    fn test_classify_char() {
        assert_eq!(classify_char('한'), CharType::HangulSyllable);
        assert_eq!(classify_char('ㄱ'), CharType::HangulJamo);
        assert_eq!(classify_char('韓'), CharType::Hanja);
        assert_eq!(classify_char('ア'), CharType::Katakana);
        assert_eq!(classify_char('あ'), CharType::Hiragana);
        assert_eq!(classify_char('a'), CharType::Alphabet);
        assert_eq!(classify_char('1'), CharType::Digit);
        assert_eq!(classify_char(' '), CharType::Whitespace);
        assert_eq!(classify_char('.'), CharType::Punctuation);
    }

    #[test]
    fn test_is_jamo() {
        // Initial-consonant jamo
        assert!(is_jamo('ㄱ'));
        assert!(is_jamo('ㄲ'));
        assert!(is_jamo('ㅎ'));

        // Medial-vowel jamo
        assert!(is_jamo('ㅏ'));
        assert!(is_jamo('ㅐ'));
        assert!(is_jamo('ㅣ'));

        // Compound final jamo
        assert!(is_jamo('ㄳ'));
        assert!(is_jamo('ㄵ'));
        assert!(is_jamo('ㅄ'));

        // Non-jamo characters
        assert!(!is_jamo('가'));
        assert!(!is_jamo('a'));
        assert!(!is_jamo('1'));
        assert!(!is_jamo(' '));
    }

    #[test]
    fn test_is_choseong() {
        // Every valid initial consonant
        assert_eq!(ALL_CHOSEONG.chars().count(), 19);
        for cho in ALL_CHOSEONG.chars() {
            assert!(is_choseong(cho), "'{cho}' should be a choseong");
        }

        // Not initial consonants
        assert!(!is_choseong('ㅏ')); // medial vowel
        assert!(!is_choseong('ㄳ')); // compound final
        assert!(!is_choseong('가')); // syllable
        assert!(!is_choseong('a'));
    }

    #[test]
    fn test_is_jungseong() {
        // Every valid medial vowel
        assert_eq!(ALL_JUNGSEONG.chars().count(), 21);
        for jung in ALL_JUNGSEONG.chars() {
            assert!(is_jungseong(jung), "'{jung}' should be a jungseong");
        }

        // Not medial vowels
        assert!(!is_jungseong('ㄱ')); // initial consonant
        assert!(!is_jungseong('ㄳ')); // compound final
        assert!(!is_jungseong('가')); // syllable
        assert!(!is_jungseong('a'));
    }

    #[test]
    fn test_boundary_hangul_syllables() {
        // First Hangul syllable
        assert!(is_hangul_syllable('가'));
        assert_eq!(decompose('가'), Some(('ㄱ', 'ㅏ', None)));

        // Last Hangul syllable
        assert!(is_hangul_syllable('힣'));
        assert_eq!(decompose('힣'), Some(('ㅎ', 'ㅣ', Some('ㅎ'))));

        // One before the first syllable
        let before_first = char::from_u32(0xAC00 - 1).unwrap();
        assert!(!is_hangul_syllable(before_first));

        // One after the last syllable
        let after_last = char::from_u32(0xD7A3 + 1).unwrap();
        assert!(!is_hangul_syllable(after_last));
    }

    #[test]
    fn test_compose_invalid_jamo() {
        // Invalid initial
        assert_eq!(compose('ㅏ', 'ㅏ', None), None);

        // Invalid medial
        assert_eq!(compose('ㄱ', 'ㄱ', None), None);

        // Invalid final
        assert_eq!(compose('ㄱ', 'ㅏ', Some('ㅏ')), None);

        // Non-jamo characters
        assert_eq!(compose('a', 'ㅏ', None), None);
        assert_eq!(compose('ㄱ', 'b', None), None);
        assert_eq!(compose('ㄱ', 'ㅏ', Some('c')), None);
    }

    #[test]
    fn test_decompose_all_choseong() {
        for cho in ALL_CHOSEONG.chars() {
            let syllable = compose(cho, 'ㅏ', None).unwrap();
            let (decomposed_cho, _, _) = decompose(syllable).unwrap();
            assert_eq!(cho, decomposed_cho, "Choseong mismatch for {syllable}");
        }
    }

    #[test]
    fn test_decompose_all_jungseong() {
        for jung in ALL_JUNGSEONG.chars() {
            let syllable = compose('ㄱ', jung, None).unwrap();
            let (_, decomposed_jung, _) = decompose(syllable).unwrap();
            assert_eq!(jung, decomposed_jung, "Jungseong mismatch for {syllable}");
        }
    }

    #[test]
    fn test_decompose_all_jongseong() {
        assert_eq!(ALL_JONGSEONG.chars().count(), 27);
        for jong in ALL_JONGSEONG.chars().map(Some) {
            let syllable = compose('ㄱ', 'ㅏ', jong).unwrap();
            let (_, _, decomposed_jong) = decompose(syllable).unwrap();
            assert_eq!(jong, decomposed_jong, "Jongseong mismatch for {syllable}");
        }
    }

    #[test]
    fn test_complex_jongseong() {
        // Compound (two-consonant) finals
        assert_eq!(compose('ㄱ', 'ㅏ', Some('ㄳ')), Some('갃'));
        assert_eq!(decompose('갃'), Some(('ㄱ', 'ㅏ', Some('ㄳ'))));

        assert_eq!(compose('ㄱ', 'ㅏ', Some('ㄵ')), Some('갅'));
        assert_eq!(decompose('갅'), Some(('ㄱ', 'ㅏ', Some('ㄵ'))));

        assert_eq!(compose('ㄱ', 'ㅏ', Some('ㅄ')), Some('값'));
        assert_eq!(decompose('값'), Some(('ㄱ', 'ㅏ', Some('ㅄ'))));
    }

    #[test]
    fn test_decompose_str_empty() {
        assert_eq!(decompose_str(""), "");
    }

    #[test]
    fn test_decompose_str_no_hangul() {
        assert_eq!(decompose_str("Hello World"), "Hello World");
        assert_eq!(decompose_str("123"), "123");
        assert_eq!(decompose_str("!@#$%"), "!@#$%");
    }

    #[test]
    fn test_decompose_str_mixed() {
        assert_eq!(decompose_str("안녕하세요"), "ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ");
        assert_eq!(decompose_str("Hello 안녕"), "Hello ㅇㅏㄴㄴㅕㅇ");
        assert_eq!(decompose_str("한글123"), "ㅎㅏㄴㄱㅡㄹ123");
    }

    #[test]
    fn test_compose_str_empty() {
        assert_eq!(compose_str(""), "");
    }

    #[test]
    fn test_compose_str_no_jamo() {
        assert_eq!(compose_str("Hello"), "Hello");
        assert_eq!(compose_str("123"), "123");
    }

    #[test]
    fn test_compose_str_incomplete_jamo() {
        // Lone initial with no vowel after it
        assert_eq!(compose_str("ㄱ"), "ㄱ");

        // Lone vowel
        assert_eq!(compose_str("ㅏ"), "ㅏ");

        // Two initials in a row cannot form a syllable
        assert_eq!(compose_str("ㄱㄴ"), "ㄱㄴ");
    }

    #[test]
    fn test_compose_str_with_jongseong_ambiguity() {
        // The first ㄴ closes the first syllable because no vowel follows it:
        // ㄱㅏㄴㄴㅕㅇ -> 간녕
        assert_eq!(compose_str("ㄱㅏㄴㄴㅕㅇ"), "간녕");

        // ㄹ is followed by a consonant, so it closes the first syllable:
        // ㄱㅏㄹㄱㅏ -> 갈가
        assert_eq!(compose_str("ㄱㅏㄹㄱㅏ"), "갈가");
    }

    #[test]
    fn test_decompose_compose_str_roundtrip() {
        let test_strings = ["한글", "안녕하세요", "대한민국", "가나다라마바사"];

        for &s in &test_strings {
            let decomposed = decompose_str(s);
            let recomposed = compose_str(&decomposed);
            assert_eq!(s, recomposed, "Roundtrip failed for '{s}'");
        }
    }

    #[test]
    fn test_has_jongseong_all_combinations() {
        // Syllables without a final
        assert_eq!(has_jongseong('가'), Some(false));
        assert_eq!(has_jongseong('나'), Some(false));
        assert_eq!(has_jongseong('다'), Some(false));

        // Syllables with a simple final
        assert_eq!(has_jongseong('각'), Some(true));
        assert_eq!(has_jongseong('난'), Some(true));

        // Syllables with a compound final
        assert_eq!(has_jongseong('갃'), Some(true)); // ㄳ
        assert_eq!(has_jongseong('닭'), Some(true)); // ㄺ
        assert_eq!(has_jongseong('값'), Some(true)); // ㅄ
    }

    #[test]
    fn test_classify_char_edge_cases() {
        // CJK Extension A
        assert_eq!(classify_char('\u{3400}'), CharType::Hanja);
        assert_eq!(classify_char('\u{4DBF}'), CharType::Hanja);

        // CJK Unified Ideographs
        assert_eq!(classify_char('\u{4E00}'), CharType::Hanja);
        assert_eq!(classify_char('\u{9FFF}'), CharType::Hanja);

        // Katakana range
        assert_eq!(classify_char('\u{30A0}'), CharType::Katakana);
        assert_eq!(classify_char('\u{30FF}'), CharType::Katakana);

        // Hiragana range
        assert_eq!(classify_char('\u{3040}'), CharType::Hiragana);
        assert_eq!(classify_char('\u{309F}'), CharType::Hiragana);

        // Other punctuation
        assert_eq!(classify_char('!'), CharType::Punctuation);
        assert_eq!(classify_char(','), CharType::Punctuation);
        assert_eq!(classify_char(';'), CharType::Punctuation);

        // Various whitespace
        assert_eq!(classify_char('\t'), CharType::Whitespace);
        assert_eq!(classify_char('\n'), CharType::Whitespace);

        // Other characters
        assert_eq!(classify_char('™'), CharType::Other);
        assert_eq!(classify_char('€'), CharType::Other);
    }

    #[test]
    fn test_all_hangul_syllables_decompose_compose() {
        // Exhaustive: all 11,172 syllables from 가 (U+AC00) to 힣 (U+D7A3).
        for code in 0xAC00_u32..=0xD7A3 {
            let c = char::from_u32(code).unwrap();
            let (cho, jung, jong) = decompose(c).unwrap();
            let recomposed = compose(cho, jung, jong).unwrap();
            assert_eq!(c, recomposed, "Failed for U+{code:04X} ({c})");
            assert_eq!(
                has_jongseong(c),
                Some(jong.is_some()),
                "has_jongseong disagrees with decompose for U+{code:04X} ({c})"
            );
        }
    }

    #[test]
    fn test_compose_str_real_words() {
        assert_eq!(compose_str("ㅎㅏㄴㄱㅡㄹ"), "한글");
        assert_eq!(compose_str("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ"), "안녕하세요");
        assert_eq!(compose_str("ㄷㅐㅎㅏㄴㅁㅣㄴㄱㅜㄱ"), "대한민국");
    }
}