lang_unicodes/
cjk_unicodes.rs1use crate::expand_ranges;
2use lazy_static::lazy_static;
3
/// Decodes a byte slice into `u32` values.
///
/// Every consecutive group of four bytes is interpreted as one little-endian
/// `u32`; the output preserves the input order.
///
/// # Panics
///
/// Panics if `arr.len()` is not a multiple of 4.
pub fn u8_to_u32(arr: &[u8]) -> Vec<u32> {
    assert!(arr.len() % 4 == 0, "File length is not a multiple of 4");

    let mut words = Vec::with_capacity(arr.len() / 4);
    for quad in arr.chunks_exact(4) {
        words.push(u32::from_le_bytes([quad[0], quad[1], quad[2], quad[3]]));
    }
    words
}
/// Decodes a byte slice into `u16` values.
///
/// Every consecutive pair of bytes is interpreted as one little-endian `u16`;
/// the output preserves the input order.
///
/// # Panics
///
/// Panics if `arr.len()` is not a multiple of 2.
pub fn u8_to_u16(arr: &[u8]) -> Vec<u16> {
    assert!(arr.len() % 2 == 0, "File length is not a multiple of 2");

    let mut units = Vec::with_capacity(arr.len() / 2);
    for pair in arr.chunks_exact(2) {
        units.push(u16::from_le_bytes([pair[0], pair[1]]));
    }
    units
}
21
22static HANGUL_SYL_SOURCE: &[u8] = include_bytes!("../data/hangul-syl.dat");
23
24static CN_CHAR_RANK: &[u8] = include_bytes!("../data/cn_char_rank.dat");
25
26fn get_part_from_cn_pkg(part_no: u8) -> Option<Vec<u32>> {
27 let data = u8_to_u16(CN_CHAR_RANK);
28 let mut last_index = 0;
29 let mut part_no = part_no as isize;
30
31 for (i, &element) in data.iter().enumerate() {
32 if element == 0 {
33 part_no -= 1;
34 if part_no < 0 {
35 return Some(
36 data[last_index..i]
37 .to_vec()
38 .into_iter()
39 .map(|i| i as u32)
40 .collect::<Vec<u32>>(),
41 );
42 }
43 last_index = i + 1;
44 }
45 }
46
47 if part_no == 0 {
48 return Some(
49 data[last_index..]
50 .to_vec()
51 .into_iter()
52 .map(|i| i as u32)
53 .collect::<Vec<u32>>(),
54 );
55 }
56 None
57}
58
59lazy_static! {
60 pub static ref ZH_COMMON: Vec<u32> = get_part_from_cn_pkg(0).unwrap();
61 pub static ref ZH_SC: Vec<u32> = get_part_from_cn_pkg(1).unwrap();
62 pub static ref ZH_TC: Vec<u32> = get_part_from_cn_pkg(2).unwrap();
63 pub static ref HANGUL_SYL: Vec<u32> =
64 u8_to_u16(HANGUL_SYL_SOURCE).into_iter().map(|x| x as u32).collect();
65 pub static ref HIRAGANA_AND_KATAKANA: Vec<u32> =
66 expand_ranges(&[(0x3040, 0x309F), (0x30A0, 0x30FF)]);
67 pub static ref HANGUL_JAMO: Vec<u32> = expand_ranges(&[(0x1100, 0x11FF)]);
68}
69
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the expected sizes of the code point tables and prints
    /// `ZH_COMMON` rendered as a string.
    #[test]
    fn test() {
        assert_eq!(HIRAGANA_AND_KATAKANA.len(), 192);
        assert_eq!(ZH_COMMON.len(), 4524);
        assert_eq!(ZH_SC.len(), 2313);
        assert_eq!(ZH_TC.len(), 2308);
        assert_eq!(HANGUL_SYL.len(), 2026);

        // Every entry must be a valid Unicode scalar value, otherwise the
        // `unwrap` below fails the test.
        let common_text: String = ZH_COMMON
            .iter()
            .map(|&code_point| std::char::from_u32(code_point).unwrap())
            .collect();
        println!("{}", common_text);
    }
}