1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
//! Unicode to ASCII transliteration.
//!
//! Converts Unicode text to a reasonable representation using only ASCII.
//!
//! For most characters in Unicode, Any-Ascii provides an ASCII-only replacement string.
//! Text is converted character-by-character without considering the context.
//! The mappings for each language are based on popular existing romanization schemes.
//! Symbolic characters are converted based on their meaning or appearance.
//! All ASCII characters in the input are left unchanged;
//! every other character is replaced with printable ASCII characters.
//! Unknown characters are removed.
mod block;

/// Transliterates a Unicode String into ASCII.
///
/// ```
/// # use any_ascii::any_ascii;
/// assert_eq!("anthropoi", any_ascii("άνθρωποι"));
/// assert_eq!("sample", any_ascii("sample"));
/// assert_eq!("ShenZhen", any_ascii("深圳"));
/// assert_eq!("Boris", any_ascii("Борис"));
/// assert_eq!("toyota", any_ascii("トヨタ"));
/// ```
pub fn any_ascii(s: &str) -> String {
    s.chars().map(any_ascii_char).collect()
}

/// Transliterates a single Unicode `char` into an ASCII string.
///
/// The upper bits of the code point (`c >> 8`) select a table via
/// `block::block`; the low byte indexes a 3-byte entry inside it. An entry is
/// decoded as follows:
///
/// * third byte has its high bit clear: all three bytes are the result;
/// * third byte has its high bit set: its low 7 bits are the result length.
///   A length of at most 3 means the result is stored inline in the leading
///   bytes of the entry; a larger length means the first two bytes are a
///   big-endian offset into the bundled `strings.txt`.
///
/// Returns the empty string when the table has no entry for the low byte.
///
/// # Examples
///
/// ```
/// # use any_ascii::any_ascii_char;
/// assert_eq!("ae", any_ascii_char('æ'));
/// assert_eq!("e", any_ascii_char('é'));
/// assert_eq!("k", any_ascii_char('k'));
/// assert_eq!("ss", any_ascii_char('ß'));
/// assert_eq!("Shen", any_ascii_char('深'));
/// assert_eq!("c", any_ascii_char('ç'));
/// assert_eq!("l", any_ascii_char('λ'));
/// assert_eq!("zh", any_ascii_char('ж'));
/// assert_eq!(":crown:", any_ascii_char('👑'));
/// assert_eq!("#", any_ascii_char('♯'));
/// ```
pub fn any_ascii_char(c: char) -> &'static str {
    let code_point = c as u32;
    let raw = block::block((code_point >> 8) as u16);

    // SAFETY: `[u8; 3]` has alignment 1, and the element count is rounded
    // down, so the reinterpreted slice never extends past `raw`.
    // NOTE(review): the `'static` lifetime assumes `block::block` hands out
    // static data — confirm against the `block` module.
    let entries: &'static [[u8; 3]] = unsafe {
        std::slice::from_raw_parts(raw.as_ptr() as *const [u8; 3], raw.len() / 3)
    };

    // Low byte of the code point selects the entry within the block.
    let entry = match entries.get((code_point & 0xff) as usize) {
        Some(entry) => entry,
        None => return "",
    };

    let tag = entry[2];
    let len = if (tag & 0x80) != 0 {
        (tag & 0x7f) as usize
    } else {
        3
    };

    if len > 3 {
        // Too long to store inline: the first two bytes are a big-endian
        // offset into the shared string table.
        let start = (usize::from(entry[0]) << 8) | usize::from(entry[1]);
        // SAFETY: presumably the generated tables only contain offsets and
        // lengths that stay inside `strings.txt` on character boundaries —
        // TODO confirm against the table generator.
        return unsafe { include_str!("strings.txt").get_unchecked(start..start + len) };
    }

    // SAFETY: `len <= 3`, so the range is within the 3-byte entry.
    // NOTE(review): skipping UTF-8 validation assumes the inline bytes are
    // ASCII — confirm against the table generator.
    unsafe { std::str::from_utf8_unchecked(entry.get_unchecked(..len)) }
}

#[test]
fn test() {
    // (input, expected transliteration) pairs, checked in order.
    const CASES: &[(&str, &str)] = &[
        ("", ""),
        ("René François Lacôte", "Rene Francois Lacote"),
        ("Großer Hörselberg", "Grosser Horselberg"),
        ("Trần Hưng Đạo", "Tran Hung Dao"),
        ("Nærøy", "Naeroy"),
        ("Φειδιππίδης", "Feidippidis"),
        ("Δημήτρης Φωτόπουλος", "Dimitris Fotopoylos"),
        ("Борис Николаевич Ельцин", "Boris Nikolaevich El'tsin"),
        ("دمنهور", "dmnhwr"),
        ("אברהם הלוי פרנקל", "'vrhm hlvy frnkl"),
        ("სამტრედია", "samt'redia"),
        ("Աբովյան", "Abovyan"),
        ("สงขลา", "sngkhla"),
        ("ສະຫວັນນະເຂດ", "sahvannaekhd"),
        ("深圳", "ShenZhen"),
        ("深水埗", "ShenShuiBu"),
        ("화성시", "hwaseongsi"),
        ("華城市", "HuaChengShi"),
        ("さいたま", "saitama"),
        ("埼玉県", "QiYuXian"),
        ("トヨタ", "toyota"),
        ("⠠⠎⠁⠽⠀⠭⠀⠁⠛", "^say x ag"),
        ("ময়মনসিংহ", "mymnsimh"),
        ("પોરબંદર", "porbmdr"),
        ("महासमुंद", "mhasmumd"),
        ("ಬೆಂಗಳೂರು", "bemgluru"),
        ("കളമശ്ശേരി", "klmsseri"),
        ("ਜਲੰਧਰ", "jlmdhr"),
        ("ଗଜପତି", "gjpti"),
        ("රත්නපුර", "rtnpur"),
        ("கன்னியாகுமரி", "knniyakumri"),
        ("శ్రీకాకుళం", "srikakulm"),
        ("😎 👑 🍎", ":sunglasses: :crown: :apple:"),
        ("☆ ♯ ♰ ⚄ ⛌", "* # + 5 X"),
        ("№ ℳ ⅋ ⅍", "No M & A/S"),
    ];

    for &(input, expected) in CASES {
        assert_eq!(any_ascii(input), expected);
    }
}