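//! `ens_normalize/nf.rs`
//!
//! Canonical decomposition (`nfd`) and composition (`nfc`) of Unicode code points,
//! driven by the tables embedded from `data/nf.json`.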

use serde::Deserialize;
use std::collections::{HashMap, HashSet};
use std::sync::LazyLock;

// Constants for the arithmetic (table-free) handling of Hangul syllables.
// A precomposed syllable at `S0 + s` splits into a leading part `s / N_COUNT`,
// a vowel part `(s % N_COUNT) / T_COUNT` and an optional trailing part `s % T_COUNT`.

/// First code point of the precomposed Hangul syllable range.
const S0: u32 = 0xAC00;
/// Base code point for leading (L) parts.
const L0: u32 = 0x1100;
/// Base code point for vowel (V) parts.
const V0: u32 = 0x1161;
/// Base for trailing (T) parts; `T0` itself means "no trailing part", so real
/// trailing code points start at `T0 + 1`.
const T0: u32 = 0x11A7;
/// Number of leading parts.
const L_COUNT: u32 = 19;
/// Number of vowel parts.
const V_COUNT: u32 = 21;
/// Number of trailing slots, including the "none" slot at index 0.
const T_COUNT: u32 = 28;
/// Syllables per leading part.
const N_COUNT: u32 = V_COUNT * T_COUNT;
/// Total number of precomposed syllables.
const S_COUNT: u32 = L_COUNT * N_COUNT;
/// Exclusive end of the precomposed syllable range.
const S1: u32 = S0 + S_COUNT;
/// Exclusive end of the leading range.
const L1: u32 = L0 + L_COUNT;
/// Exclusive end of the vowel range.
const V1: u32 = V0 + V_COUNT;
/// Exclusive end of the trailing range.
const T1: u32 = T0 + T_COUNT;

19#[derive(Deserialize)]
20struct RawNf {
21    ranks: Vec<Vec<u32>>,
22    exclusions: Vec<u32>,
23    decomp: Vec<(u32, Vec<u32>)>,
24}
25
/// Indexed normalization tables built once from the raw JSON data.
struct NfData {
    /// Code point -> rank, already shifted into bits 24..32 so it can be OR-ed
    /// directly onto the code point.
    shifted_rank: HashMap<u32, u32>,
    /// Code point -> replacement sequence, stored reversed so that popping from a
    /// stack yields the sequence in reading order.
    decomp: HashMap<u32, Vec<u32>>,
    /// First code point -> (second code point -> composed code point).
    recomp: HashMap<u32, HashMap<u32, u32>>,
}

32static NF: LazyLock<NfData> = LazyLock::new(|| {
33    let raw: RawNf = serde_json::from_str(include_str!("../data/nf.json")).expect("valid nf.json");
34    let mut shifted_rank = HashMap::new();
35    for (i, cps) in raw.ranks.iter().enumerate() {
36        let rank = ((i as u32) + 1) << 24;
37        for &cp in cps {
38            shifted_rank.insert(cp, rank);
39        }
40    }
41
42    let exclusions: HashSet<u32> = raw.exclusions.into_iter().collect();
43    let mut decomp = HashMap::new();
44    let mut recomp: HashMap<u32, HashMap<u32, u32>> = HashMap::new();
45    for (cp, mut cps) in raw.decomp {
46        if !exclusions.contains(&cp) && cps.len() == 2 {
47            recomp.entry(cps[0]).or_default().insert(cps[1], cp);
48        }
49        cps.reverse();
50        decomp.insert(cp, cps);
51    }
52
53    NfData {
54        shifted_rank,
55        decomp,
56        recomp,
57    }
58});
59
/// Extracts the rank stored in the top byte (bits 24..32) of a packed value.
///
/// Returns 0 for values that carry no rank.
const fn unpack_cc(packed: u32) -> u32 {
    // Shifting a `u32` right by 24 leaves only eight significant bits, so no mask is needed.
    packed >> 24
}

/// Extracts the code point stored in the low 24 bits of a packed value,
/// discarding the rank held in the top byte.
const fn unpack_cp(packed: u32) -> u32 {
    packed & 0x00FF_FFFF
}

68fn is_hangul(cp: u32) -> bool {
69    (S0..S1).contains(&cp)
70}
71
72fn compose_pair(a: u32, b: u32) -> Option<u32> {
73    if (L0..L1).contains(&a) && (V0..V1).contains(&b) {
74        Some(S0 + (a - L0) * N_COUNT + (b - V0) * T_COUNT)
75    } else if is_hangul(a) && b > T0 && b < T1 && (a - S0).is_multiple_of(T_COUNT) {
76        Some(a + (b - T0))
77    } else {
78        NF.recomp.get(&a).and_then(|bucket| bucket.get(&b)).copied()
79    }
80}
81
82fn decomposed(cps: &[u32]) -> Vec<u32> {
83    let mut ret = Vec::new();
84    let mut buf = Vec::new();
85    let mut check_order = false;
86
87    let add = |ret: &mut Vec<u32>, check_order: &mut bool, cp: u32| {
88        if let Some(&cc) = NF.shifted_rank.get(&cp) {
89            *check_order = true;
90            ret.push(cp | cc);
91        } else {
92            ret.push(cp);
93        }
94    };
95
96    for &cp0 in cps {
97        let mut cp = cp0;
98        loop {
99            if cp < 0x80 {
100                ret.push(cp);
101            } else if is_hangul(cp) {
102                let s_index = cp - S0;
103                let l_index = s_index / N_COUNT;
104                let v_index = (s_index % N_COUNT) / T_COUNT;
105                let t_index = s_index % T_COUNT;
106                add(&mut ret, &mut check_order, L0 + l_index);
107                add(&mut ret, &mut check_order, V0 + v_index);
108                if t_index > 0 {
109                    add(&mut ret, &mut check_order, T0 + t_index);
110                }
111            } else if let Some(mapped) = NF.decomp.get(&cp) {
112                buf.extend_from_slice(mapped);
113            } else {
114                add(&mut ret, &mut check_order, cp);
115            }
116
117            if let Some(next) = buf.pop() {
118                cp = next;
119            } else {
120                break;
121            }
122        }
123    }
124
125    if check_order && ret.len() > 1 {
126        let mut prev_cc = unpack_cc(ret[0]);
127        let mut i = 1;
128        while i < ret.len() {
129            let cc = unpack_cc(ret[i]);
130            if cc == 0 || prev_cc <= cc {
131                prev_cc = cc;
132                i += 1;
133                continue;
134            }
135            let mut j = i - 1;
136            loop {
137                ret.swap(j + 1, j);
138                if j == 0 {
139                    break;
140                }
141                j -= 1;
142                prev_cc = unpack_cc(ret[j]);
143                if prev_cc <= cc {
144                    break;
145                }
146            }
147            prev_cc = unpack_cc(ret[i]);
148            i += 1;
149        }
150    }
151
152    ret
153}
154
155fn composed_from_decomposed(v: &[u32]) -> Vec<u32> {
156    let mut ret = Vec::new();
157    let mut stack = Vec::new();
158    let mut prev_cp: Option<u32> = None;
159    let mut prev_cc = 0;
160
161    for &packed in v {
162        let cc = unpack_cc(packed);
163        let cp = unpack_cp(packed);
164        if let Some(prev) = prev_cp {
165            if prev_cc > 0 && prev_cc >= cc {
166                if cc == 0 {
167                    ret.push(prev);
168                    ret.append(&mut stack);
169                    prev_cp = Some(cp);
170                } else {
171                    stack.push(cp);
172                }
173                prev_cc = cc;
174            } else if let Some(composed) = compose_pair(prev, cp) {
175                prev_cp = Some(composed);
176            } else if prev_cc == 0 && cc == 0 {
177                ret.push(prev);
178                prev_cp = Some(cp);
179            } else {
180                stack.push(cp);
181                prev_cc = cc;
182            }
183        } else if cc == 0 {
184            prev_cp = Some(cp);
185        } else {
186            ret.push(cp);
187        }
188    }
189
190    if let Some(prev) = prev_cp {
191        ret.push(prev);
192        ret.append(&mut stack);
193    }
194
195    ret
196}
197
198pub fn nfd(cps: &[u32]) -> Vec<u32> {
199    decomposed(cps).into_iter().map(unpack_cp).collect()
200}
201
202pub fn nfc(cps: &[u32]) -> Vec<u32> {
203    composed_from_decomposed(&decomposed(cps))
204}