1use super::normalization_data::{
37 CCC_TABLE, COMPOSITION_PAIRS, NFD_INDEX, NFD_POOL, NFKD_INDEX, NFKD_POOL,
38};
39
/// First code point of the precomposed Hangul syllable range (U+AC00).
pub const HANGUL_S_BASE: u32 = 0xAC00;

/// First leading-consonant (L) jamo (U+1100).
pub const HANGUL_L_BASE: u32 = 0x1100;

/// First vowel (V) jamo (U+1161).
pub const HANGUL_V_BASE: u32 = 0x1161;

/// Base for trailing-consonant (T) jamo arithmetic (U+11A7).
///
/// A T index of zero means "no trailing consonant", so the first real
/// trailing jamo is `HANGUL_T_BASE + 1`.
pub const HANGUL_T_BASE: u32 = 0x11A7;

/// Number of leading-consonant jamo.
pub const HANGUL_L_COUNT: u32 = 19;

/// Number of vowel jamo.
pub const HANGUL_V_COUNT: u32 = 21;

/// Number of trailing-consonant slots, including the "none" slot at index 0.
pub const HANGUL_T_COUNT: u32 = 28;
56pub const HANGUL_N_COUNT: u32 = HANGUL_V_COUNT * HANGUL_T_COUNT; pub const HANGUL_S_COUNT: u32 = HANGUL_L_COUNT * HANGUL_N_COUNT; #[inline]
65pub fn ccc(cp: u32) -> u8 {
66 match CCC_TABLE.binary_search_by_key(&cp, |entry| entry.0) {
67 Ok(idx) => CCC_TABLE[idx].1,
68 Err(_) => 0,
69 }
70}
71
72#[inline]
76pub fn nfd_lookup(cp: u32) -> Option<&'static [u32]> {
77 let idx = NFD_INDEX.binary_search_by_key(&cp, |entry| entry.0).ok()?;
78 let (_, off, len) = NFD_INDEX[idx];
79 let start = off as usize;
80 let end = start + len as usize;
81 Some(&NFD_POOL[start..end])
82}
83
84#[inline]
88pub fn nfkd_lookup(cp: u32) -> Option<&'static [u32]> {
89 let idx = NFKD_INDEX.binary_search_by_key(&cp, |entry| entry.0).ok()?;
90 let (_, off, len) = NFKD_INDEX[idx];
91 let start = off as usize;
92 let end = start + len as usize;
93 Some(&NFKD_POOL[start..end])
94}
95
96#[inline]
101pub fn compose_pair(first: u32, second: u32) -> Option<u32> {
102 let idx = COMPOSITION_PAIRS
103 .binary_search_by(|entry| (entry.0, entry.1).cmp(&(first, second)))
104 .ok()?;
105 Some(COMPOSITION_PAIRS[idx].2)
106}
107
108#[inline]
112pub fn hangul_decompose_into(cp: u32, out: &mut Vec<u32>) -> bool {
113 if !(HANGUL_S_BASE..HANGUL_S_BASE + HANGUL_S_COUNT).contains(&cp) {
114 return false;
115 }
116 let s_index = cp - HANGUL_S_BASE;
117 let l = HANGUL_L_BASE + s_index / HANGUL_N_COUNT;
118 let v = HANGUL_V_BASE + (s_index % HANGUL_N_COUNT) / HANGUL_T_COUNT;
119 let t_offset = s_index % HANGUL_T_COUNT;
120 out.push(l);
121 out.push(v);
122 if t_offset != 0 {
123 out.push(HANGUL_T_BASE + t_offset);
124 }
125 true
126}
127
128#[inline]
132pub fn hangul_compose(first: u32, second: u32) -> Option<u32> {
133 if (HANGUL_L_BASE..HANGUL_L_BASE + HANGUL_L_COUNT).contains(&first)
135 && (HANGUL_V_BASE..HANGUL_V_BASE + HANGUL_V_COUNT).contains(&second)
136 {
137 let l_index = first - HANGUL_L_BASE;
138 let v_index = second - HANGUL_V_BASE;
139 return Some(HANGUL_S_BASE + (l_index * HANGUL_V_COUNT + v_index) * HANGUL_T_COUNT);
140 }
141 if (HANGUL_S_BASE..HANGUL_S_BASE + HANGUL_S_COUNT).contains(&first) {
145 let s_index = first - HANGUL_S_BASE;
146 if s_index.is_multiple_of(HANGUL_T_COUNT)
147 && (HANGUL_T_BASE + 1..HANGUL_T_BASE + HANGUL_T_COUNT).contains(&second)
148 {
149 return Some(first + (second - HANGUL_T_BASE));
150 }
151 }
152 None
153}
154
/// Selects which decomposition mapping is applied while decomposing text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecompKind {
    /// Use the canonical mappings (the NFD tables).
    Canonical,
    /// Use the compatibility mappings (the NFKD tables).
    Compatibility,
}
163
164pub fn decompose_to_buffer(input: &str, kind: DecompKind, out: &mut Vec<u32>) {
169 out.reserve(input.len() * 2);
174 for ch in input.chars() {
175 let cp = ch as u32;
176 if hangul_decompose_into(cp, out) {
177 continue;
178 }
179 let mapping = match kind {
180 DecompKind::Canonical => nfd_lookup(cp),
181 DecompKind::Compatibility => nfkd_lookup(cp),
182 };
183 match mapping {
184 Some(slice) => out.extend_from_slice(slice),
185 None => out.push(cp),
186 }
187 }
188}
189
190pub fn canonical_reorder(buf: &mut [u32]) {
194 let len = buf.len();
195 let mut i = 0;
196 while i < len {
197 if ccc(buf[i]) == 0 {
198 i += 1;
199 continue;
200 }
201 let start = i;
202 while i < len && ccc(buf[i]) != 0 {
203 i += 1;
204 }
205 buf[start..i].sort_by_key(|&cp| ccc(cp));
209 }
210}
211
212pub fn decompose_and_reorder(input: &str, kind: DecompKind) -> Vec<u32> {
214 let mut buf = Vec::with_capacity(input.len() + 4);
215 decompose_to_buffer(input, kind, &mut buf);
216 canonical_reorder(&mut buf);
217 buf
218}
219
/// Converts a sequence of code points into a `String`.
///
/// Values that are not valid Unicode scalar values (surrogates, or anything
/// above U+10FFFF) are silently skipped.
pub fn encode(cps: &[u32]) -> String {
    let mut text = String::with_capacity(cps.len());
    text.extend(cps.iter().copied().filter_map(char::from_u32));
    text
}
233
234pub fn to_nfd(input: &str) -> String {
236 encode(&decompose_and_reorder(input, DecompKind::Canonical))
237}
238
239pub fn to_nfkd(input: &str) -> String {
241 encode(&decompose_and_reorder(input, DecompKind::Compatibility))
242}
243
244pub fn compose(buf: Vec<u32>) -> Vec<u32> {
247 if buf.is_empty() {
248 return buf;
249 }
250 let mut out: Vec<u32> = Vec::with_capacity(buf.len());
251 let mut last_starter: usize = usize::MAX;
254 let mut last_ccc: u8 = 0;
256
257 for cp in buf {
258 let cur_ccc = ccc(cp);
259 if last_starter != usize::MAX {
260 let starter_cp = out[last_starter];
261 let composed = hangul_compose(starter_cp, cp).or_else(|| compose_pair(starter_cp, cp));
264 if let Some(comp) = composed {
265 let blocked = cur_ccc != 0 && last_ccc >= cur_ccc;
271 if !blocked {
272 out[last_starter] = comp;
273 continue;
274 }
275 }
276 }
277 out.push(cp);
278 if cur_ccc == 0 {
279 last_starter = out.len() - 1;
280 last_ccc = 0;
281 } else {
282 last_ccc = cur_ccc;
283 }
284 }
285 out
286}
287
288pub fn to_nfc(input: &str) -> String {
290 let decomposed = decompose_and_reorder(input, DecompKind::Canonical);
291 encode(&compose(decomposed))
292}
293
294pub fn to_nfkc(input: &str) -> String {
296 let decomposed = decompose_and_reorder(input, DecompKind::Compatibility);
297 encode(&compose(decomposed))
298}
299
/// Serializes a decomposition index and its pool into little-endian bytes.
///
/// Layout, every field a little-endian `u32`:
/// 1. number of index entries;
/// 2. for each entry: code point, pool offset, length (widened from `u8`);
/// 3. number of pool entries;
/// 4. each pool code point.
///
/// Counts are written with an `as u32` cast, so they wrap for tables with
/// more than `u32::MAX` entries.
pub fn encode_decomp_table_bytes(index: &[(u32, u32, u8)], pool: &[u32]) -> Vec<u8> {
    const WORD: usize = 4;
    let index_section = WORD + index.len() * 3 * WORD;
    let pool_section = WORD + pool.len() * WORD;
    let mut bytes = Vec::with_capacity(index_section + pool_section);

    bytes.extend((index.len() as u32).to_le_bytes());
    for &(cp, offset, length) in index {
        for field in [cp, offset, u32::from(length)] {
            bytes.extend(field.to_le_bytes());
        }
    }

    bytes.extend((pool.len() as u32).to_le_bytes());
    bytes.extend(pool.iter().flat_map(|cp| cp.to_le_bytes()));
    bytes
}
334
/// Serializes a combining-class table into little-endian bytes.
///
/// Layout, every field a little-endian `u32`: the entry count, then for each
/// entry the code point followed by its class (widened from `u8`).
///
/// The count is written with an `as u32` cast, so it wraps for tables with
/// more than `u32::MAX` entries.
pub fn encode_ccc_table_bytes(table: &[(u32, u8)]) -> Vec<u8> {
    const WORD: usize = 4;
    let mut bytes = Vec::with_capacity(WORD + table.len() * 2 * WORD);

    bytes.extend((table.len() as u32).to_le_bytes());
    for &(cp, class) in table {
        bytes.extend(cp.to_le_bytes());
        bytes.extend(u32::from(class).to_le_bytes());
    }
    bytes
}
352
/// Serializes a composition table into little-endian bytes.
///
/// Layout, every field a little-endian `u32`: the entry count, then for each
/// entry the first code point, the second code point and their composite.
///
/// The count is written with an `as u32` cast, so it wraps for tables with
/// more than `u32::MAX` entries.
pub fn encode_composition_table_bytes(table: &[(u32, u32, u32)]) -> Vec<u8> {
    const WORD: usize = 4;
    let mut bytes = Vec::with_capacity(WORD + table.len() * 3 * WORD);

    bytes.extend((table.len() as u32).to_le_bytes());
    for &(first, second, composed) in table {
        for field in [first, second, composed] {
            bytes.extend(field.to_le_bytes());
        }
    }
    bytes
}
369
#[cfg(test)]
mod tests {
    use super::*;

    /// Signature shared by the four public normalization entry points.
    type Normalizer = fn(&str) -> String;

    /// "é" as a single precomposed code point.
    const E_ACUTE_COMPOSED: &str = "caf\u{00E9}";
    /// "é" as `e` followed by a combining acute accent.
    const E_ACUTE_DECOMPOSED: &str = "cafe\u{0301}";
    /// A precomposed Hangul syllable and its three-jamo spelling.
    const HAN_SYLLABLE: &str = "\u{D55C}";
    const HAN_JAMOS: &str = "\u{1112}\u{1161}\u{11AB}";
    /// VULGAR FRACTION ONE HALF and its compatibility expansion.
    const ONE_HALF: &str = "\u{00BD}";
    const ONE_HALF_EXPANDED: &str = "1\u{2044}2";

    /// Reads the little-endian `u32` stored at byte offset `at`.
    fn word_at(bytes: &[u8], at: usize) -> u32 {
        u32::from_le_bytes([bytes[at], bytes[at + 1], bytes[at + 2], bytes[at + 3]])
    }

    #[test]
    fn ascii_roundtrips_unchanged() {
        let forms: [Normalizer; 4] = [to_nfc, to_nfd, to_nfkc, to_nfkd];
        for text in ["", "hello", "ABC 123", "the quick brown fox"] {
            for normalize in forms {
                assert_eq!(normalize(text), text);
            }
        }
    }

    #[test]
    fn nfc_composes_combining_acute() {
        // Both spellings must end up in the precomposed form.
        for input in [E_ACUTE_DECOMPOSED, E_ACUTE_COMPOSED] {
            assert_eq!(to_nfc(input), E_ACUTE_COMPOSED);
        }
    }

    #[test]
    fn nfd_decomposes_precomposed_acute() {
        // Both spellings must end up in the decomposed form.
        for input in [E_ACUTE_COMPOSED, E_ACUTE_DECOMPOSED] {
            assert_eq!(to_nfd(input), E_ACUTE_DECOMPOSED);
        }
    }

    #[test]
    fn hangul_nfd_uses_algorithmic_decomposition() {
        assert_eq!(to_nfd(HAN_SYLLABLE), HAN_JAMOS);
    }

    #[test]
    fn hangul_nfc_recomposes_jamos() {
        assert_eq!(to_nfc(HAN_JAMOS), HAN_SYLLABLE);
    }

    #[test]
    fn nfkd_expands_compatibility_form() {
        assert_eq!(to_nfkd(ONE_HALF), ONE_HALF_EXPANDED);
        // The canonical form leaves the compatibility character alone.
        assert_eq!(to_nfd(ONE_HALF), ONE_HALF);
    }

    #[test]
    fn nfkc_does_not_recompose_compatibility_fraction() {
        assert_eq!(to_nfkc(ONE_HALF), ONE_HALF_EXPANDED);
    }

    #[test]
    fn canonical_reorder_sorts_combining_marks_by_ccc() {
        let unordered = "a\u{0307}\u{0323}";
        let ordered = "a\u{0323}\u{0307}";
        for input in [unordered, ordered] {
            assert_eq!(to_nfd(input), ordered);
        }
    }

    #[test]
    fn nfc_idempotence() {
        let samples = [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "1\u{2044}2",
            "a\u{0307}\u{0323}b",
        ];
        for s in samples {
            let first_pass = to_nfc(s);
            let second_pass = to_nfc(&first_pass);
            assert_eq!(second_pass, first_pass, "NFC idempotence fail on {s:?}");
        }
    }

    #[test]
    fn nfd_idempotence() {
        let samples = [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "a\u{0307}\u{0323}b",
        ];
        for s in samples {
            let first_pass = to_nfd(s);
            let second_pass = to_nfd(&first_pass);
            assert_eq!(second_pass, first_pass, "NFD idempotence fail on {s:?}");
        }
    }

    #[test]
    fn nfkc_idempotence() {
        let samples = [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "\u{00BD}",
            "\u{FB01}le",
        ];
        for s in samples {
            let first_pass = to_nfkc(s);
            let second_pass = to_nfkc(&first_pass);
            assert_eq!(second_pass, first_pass, "NFKC idempotence fail on {s:?}");
        }
    }

    #[test]
    fn nfkd_idempotence() {
        let samples = [
            "",
            "caf\u{00E9}",
            "\u{D55C}\u{AD6D}\u{C5B4}",
            "\u{00BD}",
            "\u{FB01}le",
        ];
        for s in samples {
            let first_pass = to_nfkd(s);
            let second_pass = to_nfkd(&first_pass);
            assert_eq!(second_pass, first_pass, "NFKD idempotence fail on {s:?}");
        }
    }

    #[test]
    fn nfc_skips_full_composition_exclusion() {
        // Plain "K" stays put, and KELVIN SIGN must not come back after
        // being decomposed.
        for input in ["K", "\u{212A}"] {
            assert_eq!(to_nfc(input), "K");
        }
    }

    #[test]
    fn nfd_decomposes_kelvin_to_ascii_k() {
        assert_eq!(to_nfd("\u{212A}"), "K");
    }

    #[test]
    fn ligature_nfkc_splits_into_components() {
        let ligature = "\u{FB01}";
        assert_eq!(to_nfkd(ligature), "fi");
        assert_eq!(to_nfkc(ligature), "fi");
    }

    #[test]
    fn nfc_starter_blocking_prevents_invalid_composition() {
        let input = "a\u{0308}\u{0301}";
        let expected = "\u{00E4}\u{0301}";
        assert_eq!(to_nfc(input), expected);
    }

    #[test]
    fn encode_decomp_table_layout() {
        let index: &[(u32, u32, u8)] = &[(0x00C0, 0, 2), (0x00C1, 2, 2)];
        let pool: &[u32] = &[0x0041, 0x0300, 0x0041, 0x0301];
        let bytes = encode_decomp_table_bytes(index, pool);
        assert_eq!(bytes.len(), 4 + 2 * 12 + 4 + 4 * 4);

        // (byte offset, expected little-endian word)
        let expectations = [
            (0, 2),       // index entry count
            (4, 0x00C0),  // first entry: code point
            (8, 0),       // first entry: pool offset
            (12, 2),      // first entry: length
            (28, 4),      // pool entry count
            (32, 0x0041), // first pool code point
        ];
        for (offset, expected) in expectations {
            assert_eq!(word_at(&bytes, offset), expected);
        }
    }

    #[test]
    fn encode_ccc_table_layout() {
        let table: &[(u32, u8)] = &[(0x0300, 230), (0x0301, 230)];
        let bytes = encode_ccc_table_bytes(table);
        assert_eq!(bytes.len(), 4 + 2 * 8);

        for (offset, expected) in [(0, 2), (4, 0x0300), (8, 230)] {
            assert_eq!(word_at(&bytes, offset), expected);
        }
    }

    #[test]
    fn encode_composition_table_layout() {
        let table: &[(u32, u32, u32)] = &[(0x0041, 0x0300, 0x00C0)];
        let bytes = encode_composition_table_bytes(table);
        assert_eq!(bytes.len(), 4 + 12);

        for (offset, expected) in [(0, 1), (4, 0x0041), (8, 0x0300), (12, 0x00C0)] {
            assert_eq!(word_at(&bytes, offset), expected);
        }
    }

    #[test]
    fn ccc_table_contains_combining_acute() {
        for (cp, expected_class) in [(0x0301, 230), (0x0041, 0)] {
            assert_eq!(ccc(cp), expected_class);
        }
    }

    #[test]
    fn composition_table_sorted_and_excludes_kelvin() {
        // Every adjacent pair of keys must be strictly increasing.
        let keys = COMPOSITION_PAIRS.iter().map(|t| (t.0, t.1));
        for (a, b) in keys.clone().zip(keys.skip(1)) {
            assert!(a < b, "COMPOSITION_PAIRS must be sorted: {a:?} >= {b:?}");
        }

        let kelvin_lookup =
            COMPOSITION_PAIRS.binary_search_by_key(&(0x004B, 0), |t| (t.0, t.1));
        assert!(
            kelvin_lookup.is_err(),
            "U+212A should be excluded from COMPOSITION_PAIRS"
        );
    }
}