lang_unicodes/
cjk_unicodes.rs

1use crate::expand_ranges;
2use lazy_static::lazy_static;
3
/// Reads a byte slice as a sequence of little-endian `u32` values.
///
/// Each consecutive group of four bytes is decoded into one `u32`, in order.
/// An empty slice yields an empty vector.
///
/// # Panics
///
/// Panics if `arr.len()` is not a multiple of 4.
pub fn u8_to_u32(arr: &[u8]) -> Vec<u32> {
    assert!(arr.len() % 4 == 0, "File length is not a multiple of 4");

    // The assert above guarantees there is no remainder, so `chunks_exact`
    // visits every byte and each chunk is exactly four bytes long.
    arr.chunks_exact(4)
        .map(|chunk| u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
        .collect()
}
/// Reads a byte slice as a sequence of little-endian `u16` values.
///
/// Each consecutive pair of bytes is decoded into one `u16`, in order.
/// An empty slice yields an empty vector.
///
/// # Panics
///
/// Panics if `arr.len()` is not a multiple of 2.
pub fn u8_to_u16(arr: &[u8]) -> Vec<u16> {
    assert!(arr.len() % 2 == 0, "File length is not a multiple of 2");

    // The assert above guarantees there is no remainder, so `chunks_exact`
    // visits every byte and each chunk is exactly two bytes long.
    arr.chunks_exact(2)
        .map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
        .collect()
}
21
/// Raw bytes of `../data/hangul-syl.dat`, embedded into the binary at compile time.
///
/// This file decodes them with `u8_to_u16` (little-endian `u16` values) to
/// build `HANGUL_SYL`.
static HANGUL_SYL_SOURCE: &[u8] = include_bytes!("../data/hangul-syl.dat");

/// Raw bytes of `../data/cn_char_rank.dat`, embedded into the binary at compile time.
///
/// `get_part_from_cn_pkg` decodes them with `u8_to_u16` and treats every `0`
/// value as a separator between consecutive sections of the table.
static CN_CHAR_RANK: &[u8] = include_bytes!("../data/cn_char_rank.dat");
25
26fn get_part_from_cn_pkg(part_no: u8) -> Option<Vec<u32>> {
27    let data = u8_to_u16(CN_CHAR_RANK);
28    let mut last_index = 0;
29    let mut part_no = part_no as isize;
30
31    for (i, &element) in data.iter().enumerate() {
32        if element == 0 {
33            part_no -= 1;
34            if part_no < 0 {
35                return Some(
36                    data[last_index..i]
37                        .to_vec()
38                        .into_iter()
39                        .map(|i| i as u32)
40                        .collect::<Vec<u32>>(),
41                );
42            }
43            last_index = i + 1;
44        }
45    }
46
47    if part_no == 0 {
48        return Some(
49            data[last_index..]
50                .to_vec()
51                .into_iter()
52                .map(|i| i as u32)
53                .collect::<Vec<u32>>(),
54        );
55    }
56    None
57}
58
59lazy_static! {
60    pub static ref ZH_COMMON: Vec<u32> = get_part_from_cn_pkg(0).unwrap();
61    pub static ref ZH_SC: Vec<u32> = get_part_from_cn_pkg(1).unwrap();
62    pub static ref ZH_TC: Vec<u32> = get_part_from_cn_pkg(2).unwrap();
63    pub static ref HANGUL_SYL: Vec<u32> =
64        u8_to_u16(HANGUL_SYL_SOURCE).into_iter().map(|x| x as u32).collect();
65    pub static ref HIRAGANA_AND_KATAKANA: Vec<u32> =
66        expand_ranges(&[(0x3040, 0x309F), (0x30A0, 0x30FF)]);
67    pub static ref HANGUL_JAMO: Vec<u32> = expand_ranges(&[(0x1100, 0x11FF)]);
68}
69
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the sizes of the lazily built tables and that every entry of
    /// `ZH_COMMON` is a valid Unicode scalar value (the conversion to `char`
    /// would panic otherwise).
    #[test]
    fn test() {
        assert_eq!(HIRAGANA_AND_KATAKANA.len(), 192);
        assert_eq!(ZH_COMMON.len(), 4524);
        assert_eq!(ZH_SC.len(), 2313);
        assert_eq!(ZH_TC.len(), 2308);
        assert_eq!(HANGUL_SYL.len(), 2026);

        // `u32` is `Copy`, so destructure the reference instead of cloning.
        let common: String = ZH_COMMON
            .iter()
            .map(|&code_point| std::char::from_u32(code_point).unwrap())
            .collect();
        println!("{}", common);
    }
}
88}