ens-normalize 0.1.1

Rust port of adraffy's ENS normalizer
Documentation
use crate::intmap::{IntMap, IntSet};
use serde::Deserialize;
use std::sync::LazyLock;

const S0: u32 = 0xAC00;
const L0: u32 = 0x1100;
const V0: u32 = 0x1161;
const T0: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;
const S1: u32 = S0 + S_COUNT;
const L1: u32 = L0 + L_COUNT;
const V1: u32 = V0 + V_COUNT;
const T1: u32 = T0 + T_COUNT;

#[derive(Deserialize)]
struct RawNf {
    ranks: Vec<Vec<u32>>,
    exclusions: Vec<u32>,
    decomp: Vec<(u32, Vec<u32>)>,
}

struct NfData {
    shifted_rank: IntMap<u32, u32>,
    decomp: IntMap<u32, Vec<u32>>,
    recomp: IntMap<u32, IntMap<u32, u32>>,
}

static NF: LazyLock<NfData> = LazyLock::new(|| {
    let raw: RawNf = serde_json::from_str(include_str!("../data/nf.json")).expect("valid nf.json");
    let mut shifted_rank = IntMap::default();
    for (i, cps) in raw.ranks.iter().enumerate() {
        let rank = ((i as u32) + 1) << 24;
        for &cp in cps {
            shifted_rank.insert(cp, rank);
        }
    }

    let exclusions: IntSet<u32> = raw.exclusions.into_iter().collect();
    let mut decomp = IntMap::default();
    let mut recomp: IntMap<u32, IntMap<u32, u32>> = IntMap::default();
    for (cp, mut cps) in raw.decomp {
        if !exclusions.contains(&cp) && cps.len() == 2 {
            recomp.entry(cps[0]).or_default().insert(cps[1], cp);
        }
        cps.reverse();
        decomp.insert(cp, cps);
    }

    NfData {
        shifted_rank,
        decomp,
        recomp,
    }
});

fn unpack_cc(packed: u32) -> u32 {
    (packed >> 24) & 0xFF
}

fn unpack_cp(packed: u32) -> u32 {
    packed & 0xFF_FFFF
}

fn is_hangul(cp: u32) -> bool {
    (S0..S1).contains(&cp)
}

fn compose_pair(a: u32, b: u32) -> Option<u32> {
    if (L0..L1).contains(&a) && (V0..V1).contains(&b) {
        Some(S0 + (a - L0) * N_COUNT + (b - V0) * T_COUNT)
    } else if is_hangul(a) && b > T0 && b < T1 && (a - S0).is_multiple_of(T_COUNT) {
        Some(a + (b - T0))
    } else {
        NF.recomp.get(&a).and_then(|bucket| bucket.get(&b)).copied()
    }
}

fn decomposed(cps: &[u32]) -> Vec<u32> {
    let mut ret = Vec::new();
    let mut buf = Vec::new();
    let mut check_order = false;

    let add = |ret: &mut Vec<u32>, check_order: &mut bool, cp: u32| {
        if let Some(&cc) = NF.shifted_rank.get(&cp) {
            *check_order = true;
            ret.push(cp | cc);
        } else {
            ret.push(cp);
        }
    };

    for &cp0 in cps {
        let mut cp = cp0;
        loop {
            if cp < 0x80 {
                ret.push(cp);
            } else if is_hangul(cp) {
                let s_index = cp - S0;
                let l_index = s_index / N_COUNT;
                let v_index = (s_index % N_COUNT) / T_COUNT;
                let t_index = s_index % T_COUNT;
                add(&mut ret, &mut check_order, L0 + l_index);
                add(&mut ret, &mut check_order, V0 + v_index);
                if t_index > 0 {
                    add(&mut ret, &mut check_order, T0 + t_index);
                }
            } else if let Some(mapped) = NF.decomp.get(&cp) {
                buf.extend_from_slice(mapped);
            } else {
                add(&mut ret, &mut check_order, cp);
            }

            if let Some(next) = buf.pop() {
                cp = next;
            } else {
                break;
            }
        }
    }

    if check_order && ret.len() > 1 {
        let mut prev_cc = unpack_cc(ret[0]);
        let mut i = 1;
        while i < ret.len() {
            let cc = unpack_cc(ret[i]);
            if cc == 0 || prev_cc <= cc {
                prev_cc = cc;
                i += 1;
                continue;
            }
            let mut j = i - 1;
            loop {
                ret.swap(j + 1, j);
                if j == 0 {
                    break;
                }
                j -= 1;
                prev_cc = unpack_cc(ret[j]);
                if prev_cc <= cc {
                    break;
                }
            }
            prev_cc = unpack_cc(ret[i]);
            i += 1;
        }
    }

    ret
}

fn composed_from_decomposed(v: &[u32]) -> Vec<u32> {
    let mut ret = Vec::new();
    let mut stack = Vec::new();
    let mut prev_cp: Option<u32> = None;
    let mut prev_cc = 0;

    for &packed in v {
        let cc = unpack_cc(packed);
        let cp = unpack_cp(packed);
        if let Some(prev) = prev_cp {
            if prev_cc > 0 && prev_cc >= cc {
                if cc == 0 {
                    ret.push(prev);
                    ret.append(&mut stack);
                    prev_cp = Some(cp);
                } else {
                    stack.push(cp);
                }
                prev_cc = cc;
            } else if let Some(composed) = compose_pair(prev, cp) {
                prev_cp = Some(composed);
            } else if prev_cc == 0 && cc == 0 {
                ret.push(prev);
                prev_cp = Some(cp);
            } else {
                stack.push(cp);
                prev_cc = cc;
            }
        } else if cc == 0 {
            prev_cp = Some(cp);
        } else {
            ret.push(cp);
        }
    }

    if let Some(prev) = prev_cp {
        ret.push(prev);
        ret.append(&mut stack);
    }

    ret
}

pub fn nfd(cps: &[u32]) -> Vec<u32> {
    decomposed(cps).into_iter().map(unpack_cp).collect()
}

pub fn nfc(cps: &[u32]) -> Vec<u32> {
    composed_from_decomposed(&decomposed(cps))
}