disarm 0.10.0

Unicode canonicalization and TR39 confusable analysis: building blocks for text-security pipelines (homoglyph/bidi/zalgo handling) plus standards-based transliteration
Documentation
//! Chinese Hanzi → Pinyin transliteration table.
//!
//! Context-free, single-reading mapping from the Unicode Unihan kMandarin field.
//! Covers the CJK Unified Ideographs block (U+4E00–U+9FFF).
//!
//! Pinyin is stripped of tone marks/numbers for ASCII output.
//! Where a character has multiple readings, the most common reading
//! (as determined by the Unicode Consortium) is used.
//!
//! Generated from Unicode Unihan database.
//! Character count: 20924
//!
//! Lookup table generated by build.rs from src/tables/data/hanzi_pinyin.tsv
//! (now a dense id table rather than a PHF map; the generated file keeps its `_phf` name).

// Hanzi → Pinyin (ASCII, no tones) — dense interned table over U+4E00–U+9FFF
// (#237 item 2): `HANZI_PINYIN_IDS[cp - BASE]` is a u16 id into `HANZI_PINYIN_VALUES`
// (id 0 = no entry). Flat index, no hashing; ~50 KB vs the former ~600 KB PHF.
//
// The generated file is still named `hanzi_pinyin_phf.rs` even though the table
// it defines is no longer a perfect-hash map. `lookup_hanzi` below reads
// `HANZI_PINYIN_BASE`, `HANZI_PINYIN_IDS` and `HANZI_PINYIN_VALUES` from it.
include!(concat!(env!("OUT_DIR"), "/hanzi_pinyin_phf.rs"));

// Hanzi → Pinyin (toned, with diacritics) — O(1) lookup via compile-time perfect hash.
// Covers the most common ~2000 characters; falls through to toneless for the rest.
//
// The generated file provides `HANZI_PINYIN_TONED`, which `lookup_hanzi_toned`
// queries by `char`; the fall-through to the toneless table happens in that
// function, not in the map itself.
include!(concat!(env!("OUT_DIR"), "/hanzi_pinyin_toned_phf.rs"));

/// Resolve the toneless ASCII pinyin for a single Hanzi character.
///
/// Returns `None` when `ch` sorts below `HANZI_PINYIN_BASE`, when its offset
/// falls past the end of `HANZI_PINYIN_IDS`, or when the slot holds id 0
/// (the "no entry" sentinel). Otherwise the id selects the interned string
/// in `HANZI_PINYIN_VALUES`.
#[inline]
pub fn lookup_hanzi(ch: char) -> Option<&'static str> {
    // Offset of the code point from the start of the table; bail out on underflow.
    let code_point = u32::from(ch);
    let delta = code_point.checked_sub(HANZI_PINYIN_BASE)?;
    let index = usize::try_from(delta).ok()?;

    match *HANZI_PINYIN_IDS.get(index)? {
        // Sentinel: the code point is inside the table range but has no reading.
        0 => None,
        // Any non-zero id refers to a real interned value.
        id => Some(HANZI_PINYIN_VALUES[id as usize]),
    }
}

/// Resolve the toned pinyin for a Hanzi character, preferring diacritics.
///
/// Yields the diacritical form (e.g., "běi") when `HANZI_PINYIN_TONED` has an
/// entry for `ch`; otherwise defers to [`lookup_hanzi`] and yields the toneless
/// form (e.g., "bei"), or `None` if that lookup also misses.
#[inline]
pub fn lookup_hanzi_toned(ch: char) -> Option<&'static str> {
    match HANZI_PINYIN_TONED.get(&ch) {
        Some(toned) => Some(*toned),
        // Not in the toned table: fall back to the toneless table.
        None => lookup_hanzi(ch),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Characters present in the table resolve to their toneless pinyin.
    #[test]
    fn test_lookup_basic() {
        assert_eq!(lookup_hanzi('\u{4E00}'), Some("yi")); // 一 (U+4E00)
        assert_eq!(lookup_hanzi('\u{4EAC}'), Some("jing")); // 京 (U+4EAC)
    }

    /// A character outside the table's range yields `None`.
    #[test]
    fn test_lookup_not_found() {
        assert!(lookup_hanzi('a').is_none());
    }
}