novel_cli/utils/
unicode.rs

1pub static CONVERT_MAP: phf::Map<char, char> = phf::phf_map! {
2    '"' => '"',
3    '#' => '#',
4    '$' => '$',
5    '%' => '%',
6    '&' => '&',
7    ''' => '\'',
8    '*' => '*',
9    '+' => '+',
10    '.' => '.',
11    '/' => '/',
12    '0' => '0',
13    '1' => '1',
14    '2' => '2',
15    '3' => '3',
16    '4' => '4',
17    '5' => '5',
18    '6' => '6',
19    '7' => '7',
20    '8' => '8',
21    '9' => '9',
22    '<' => '<',
23    '=' => '=',
24    '>' => '>',
25    '@' => '@',
26    'A' => 'A',
27    'B' => 'B',
28    'C' => 'C',
29    'D' => 'D',
30    'E' => 'E',
31    'F' => 'F',
32    'G' => 'G',
33    'H' => 'H',
34    'I' => 'I',
35    'J' => 'J',
36    'K' => 'K',
37    'L' => 'L',
38    'M' => 'M',
39    'N' => 'N',
40    'O' => 'O',
41    'P' => 'P',
42    'Q' => 'Q',
43    'R' => 'R',
44    'S' => 'S',
45    'T' => 'T',
46    'U' => 'U',
47    'V' => 'V',
48    'W' => 'W',
49    'X' => 'X',
50    'Y' => 'Y',
51    'Z' => 'Z',
52    '\' => '\\',
53    '^' => '^',
54    '`' => '`',
55    'a' => 'a',
56    'b' => 'b',
57    'c' => 'c',
58    'd' => 'd',
59    'e' => 'e',
60    'f' => 'f',
61    'g' => 'g',
62    'h' => 'h',
63    'i' => 'i',
64    'j' => 'j',
65    'k' => 'k',
66    'l' => 'l',
67    'm' => 'm',
68    'n' => 'n',
69    'o' => 'o',
70    'p' => 'p',
71    'q' => 'q',
72    'r' => 'r',
73    's' => 's',
74    't' => 't',
75    'u' => 'u',
76    'v' => 'v',
77    'w' => 'w',
78    'x' => 'x',
79    'y' => 'y',
80    'z' => 'z',
81    '{' => '{',
82    '|' => '|',
83    '}' => '}',
84    '。' => '。',
85    '「' => '「',
86    '」' => '」',
87    '、' => '、',
88    '・' => '·',
89    '•' => '·',
90    '─' => '—',
91    '―' => '—',
92    '∶' => ':',
93    '‧' => '·',
94    '・' => '·',
95    '﹑' => '、',
96    '〜' => '~',
97    '︰' => ':',
98    '?' => '?',
99    '!' => '!',
100    ',' => ',',
101    ';' => ';',
102    '(' => '(',
103    ')' => ')',
104};
105
106// https://zh.wiktionary.org/wiki/
107pub static CONVERT_T2S_MAP: phf::Map<char, char> = phf::phf_map! {
108    '妳' => '你',
109    '姊' => '姐',
110    '擡' => '抬',
111    '牠' => '它',
112    '緖' => '绪',
113    '揹' => '背',
114};
115
116// https://zh.wikipedia.org/wiki/%E4%B8%AD%E6%97%A5%E9%9F%93%E7%B5%B1%E4%B8%80%E8%A1%A8%E6%84%8F%E6%96%87%E5%AD%97
117// Unicode 15.1
118#[must_use]
119#[inline]
120pub const fn is_cjk(c: char) -> bool {
121    c == '\u{3007}'
122        || range(c, '\u{3400}', '\u{4DBF}')
123        || range(c, '\u{4E00}', '\u{9FFF}')
124        || range(c, '\u{FA0E}', '\u{FA0F}')
125        || c == '\u{FA11}'
126        || range(c, '\u{FA13}', '\u{FA14}')
127        || c == '\u{FA1F}'
128        || c == '\u{FA21}'
129        || range(c, '\u{FA23}', '\u{FA24}')
130        || range(c, '\u{FA27}', '\u{FA29}')
131        || range(c, '\u{20000}', '\u{2A6DF}')
132        || range(c, '\u{2A700}', '\u{2B739}')
133        || range(c, '\u{2B740}', '\u{2B81D}')
134        || range(c, '\u{2B820}', '\u{2CEA1}')
135        || range(c, '\u{2CEB0}', '\u{2EBE0}')
136        || range(c, '\u{2EBF0}', '\u{2EE5F}')
137        || range(c, '\u{30000}', '\u{3134A}')
138        || range(c, '\u{31350}', '\u{323AF}')
139}
140
/// Returns `true` when `c` lies in the closed interval `min..=max`.
///
/// Both bounds are inclusive. Kept as a `const fn` so that it can be called
/// from other `const fn`s.
#[must_use]
#[inline]
const fn range(c: char, min: char, max: char) -> bool {
    min <= c && c <= max
}
146
147// https://zh.wikipedia.org/wiki/%E6%A0%87%E7%82%B9%E7%AC%A6%E5%8F%B7
148static CHINESE_PUNCTUATION: phf::Set<char> = phf::phf_set! {
149    '。',
150    '?',
151    '!',
152    ',',
153    '、',
154    ';',
155    ':',
156    '“',
157    '”',
158    '『',
159    '』',
160    '‘',
161    '’',
162    '「',
163    '」',
164    '(',
165    ')',
166    '[',
167    ']',
168    '〔',
169    '〕',
170    '【',
171    '】',
172    // ——
173    '—',
174    // ……
175    '…',
176    '-',
177    '-',
178    '~',
179    '·',
180    '《',
181    '》',
182    '〈',
183    '〉',
184    // ﹏﹏
185    '﹏',
186    // __
187    '_',
188    '.'
189};
190
191#[must_use]
192#[inline]
193pub fn is_chinese_punctuation(c: char) -> bool {
194    CHINESE_PUNCTUATION.contains(&c)
195}
196
197// https://zh.wikipedia.org/wiki/%E6%A0%87%E7%82%B9%E7%AC%A6%E5%8F%B7
198static ENGLISH_PUNCTUATION: phf::Set<char> = phf::phf_set! {
199    '.',
200    '?',
201    '!',
202    ',',
203    ':',
204    '…',
205    ';',
206    '-',
207    '–',
208    '—',
209    '(',
210    ')',
211    '[',
212    ']',
213    '{',
214    '}',
215    '"',
216    '\'',
217    '/',
218};
219
220#[must_use]
221#[inline]
222pub fn is_english_punctuation(c: char) -> bool {
223    ENGLISH_PUNCTUATION.contains(&c)
224}
225
226#[must_use]
227#[inline]
228pub fn is_punctuation(c: char) -> bool {
229    is_chinese_punctuation(c) || is_english_punctuation(c)
230}
231
#[cfg(test)]
mod test {
    use super::*;

    /// Samples from several accepted ranges, plus range edges and
    /// non-ideographs.
    #[test]
    fn test_is_cjk() {
        assert!(is_cjk('你'));
        assert!(is_cjk('〇'));
        assert!(is_cjk('䀹'));
        assert!(is_cjk('鿃'));
        assert!(is_cjk('\u{9FEB}'));
        assert!(is_cjk('﨧'));
        assert!(is_cjk('𱞈'));

        // Inclusive bounds of Extension A and the first code point after it.
        assert!(is_cjk('\u{3400}'));
        assert!(is_cjk('\u{4DBF}'));
        assert!(!is_cjk('\u{4DC0}'));

        // U+FA10 sits between two accepted compatibility code points but is
        // not itself accepted.
        assert!(!is_cjk('\u{FA10}'));

        assert!(!is_cjk('a'));
        assert!(!is_cjk('🍌'));
    }

    /// Smoke test for the punctuation predicates.
    #[test]
    fn test_is_punctuation() {
        assert!(is_chinese_punctuation('。'));
        assert!(is_english_punctuation('.'));
        assert!(is_punctuation('—'));
        assert!(!is_punctuation('a'));
        assert!(!is_punctuation('你'));
    }
}