unicode_id_trie_rle/
lib.rs1#![doc = include_str!("../README.md")]
2#![cfg_attr(not(test), no_std)]
3const IDENTIFIER_OTHER: u8 = 0;
4const IDENTIFIER_START: u8 = 1;
5const IDENTIFIER_CONTINUE: u8 = 2;
6const START_CODEPOINT: u32 = 0x80;
7
8include!(concat!(env!("OUT_DIR"), "/table.rs"));
9
10const BLOCK_MASK: u32 = (1 << SHIFT) - 1;
11const LOWER_MASK: u32 = (1 << LOWER_BITS) - 1;
12const ASCII_TABLE: [u8; 128] = ascii_table();
13
14#[derive(Clone, Copy)]
15struct Leaf {
16 offset: usize,
17 len: usize,
18}
19
20pub struct UnicodeIdentifierClass(u8);
24
25impl UnicodeIdentifierClass {
26 #[inline]
29 pub fn is_start(&self) -> bool {
30 self.0 & IDENTIFIER_START != 0
31 }
32
33 #[inline]
36 pub fn is_continue(&self) -> bool {
37 self.0 & IDENTIFIER_CONTINUE != 0
38 }
39}
40
41#[inline]
42fn load_leaf(idx: usize) -> Leaf {
43 debug_assert!(idx + 1 < LEAF_OFFSETS.len());
44 let start = LEAF_OFFSETS[idx] as usize;
45 let end = LEAF_OFFSETS[idx + 1] as usize;
46 Leaf {
47 offset: start,
48 len: end - start,
49 }
50}
51
52#[inline]
53fn leaf_value(leaf: Leaf, offset: u16) -> UnicodeIdentifierClass {
54 debug_assert!(leaf.len >= 2);
55 let runs = &LEAF_RUN_STARTS[leaf.offset..leaf.offset + leaf.len];
56 let values = &LEAF_RUN_VALUES[leaf.offset..leaf.offset + leaf.len];
57 let idx = runs.partition_point(|&start| start <= offset);
59 UnicodeIdentifierClass(values[idx.saturating_sub(1)])
60}
61
62#[inline]
65pub fn unicode_identifier_class(cp: char) -> UnicodeIdentifierClass {
66 if (cp as u32) < START_CODEPOINT {
68 return UnicodeIdentifierClass(ASCII_TABLE[cp as usize]);
69 }
70
71 if (cp as u32) >= 0x100000 {
72 return UnicodeIdentifierClass(IDENTIFIER_OTHER);
73 }
74
75 let cp = cp as u32;
76 let block = cp >> SHIFT;
77 debug_assert!(block < BLOCK_COUNT as u32);
78 let top = (block >> LOWER_BITS) as usize;
79 let bottom = (block & LOWER_MASK) as usize;
80 let level2_idx = LEVEL1_TABLE[top] as usize;
81 let leaf_idx = LEVEL2_TABLES[level2_idx * LOWER_SIZE + bottom] as usize;
82 let leaf = load_leaf(leaf_idx);
83 let offset = (cp & BLOCK_MASK) as u16;
84 leaf_value(leaf, offset)
85}
86
87const fn ascii_table() -> [u8; 128] {
88 let mut table = [0u8; 128];
89 let mut c = b'A';
90 while c <= b'Z' {
91 table[c as usize] = IDENTIFIER_START | IDENTIFIER_CONTINUE;
92 c += 1;
93 }
94 c = b'a';
95 while c <= b'z' {
96 table[c as usize] = IDENTIFIER_START | IDENTIFIER_CONTINUE;
97 c += 1;
98 }
99 c = b'0';
100 while c <= b'9' {
101 table[c as usize] = IDENTIFIER_CONTINUE;
102 c += 1;
103 }
104 table[b'_' as usize] = IDENTIFIER_CONTINUE;
105 table
106}
107
108#[inline]
115pub fn is_identifier(cp: &[char]) -> bool {
116 if cp.is_empty() {
117 return false;
118 }
119
120 if !unicode_identifier_class(cp[0]).is_start() {
121 return false;
122 }
123
124 for (i, c) in cp.iter().enumerate() {
125 if !unicode_identifier_class(*c).is_continue() {
126 if (*c != '\u{200c}' && *c != '\u{200d}') || i + 1 == cp.len() {
129 return false;
130 }
131 }
132 }
133
134 true
135}
136
137#[inline]
144pub fn str_is_identifier(s: &str) -> bool {
145 let mut iter = s.chars();
146 let Some(first) = iter.next() else {
147 return false;
148 };
149
150 if !unicode_identifier_class(first).is_start() {
151 return false;
152 }
153
154 let mut iter = iter.peekable();
155 while let Some(c) = iter.next() {
156 if !unicode_identifier_class(c).is_continue() {
157 if (c != '\u{200c}' && c != '\u{200d}') || iter.peek().is_none() {
160 return false;
161 }
162 }
163 }
164
165 true
166}
167
168#[cfg(test)]
169mod tests {
170 use super::*;
171 use proptest::prelude::*;
172 use std::{fs::File, path::PathBuf, sync::OnceLock};
173
174 const MAX_SCALAR: usize = 0x110000;
175
176 fn derived_identifier_table() -> &'static [u8] {
177 static TABLE: OnceLock<Box<[u8]>> = OnceLock::new();
178 TABLE
179 .get_or_init(|| {
180 let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
181 let derived_path =
182 manifest_dir.join("./DerivedCoreProperties.txt");
183 let file = File::open(&derived_path).unwrap_or_else(|err| {
184 panic!("failed to open {}: {err}", derived_path.display())
185 });
186
187 let parsed =
188 unicode_id_trie_rle_derived_core_properties::parse(file)
189 .unwrap_or_else(|err| {
190 panic!("failed to parse derived data: {err}")
191 });
192 let mut table = vec![0u8; MAX_SCALAR];
193 for (ch, props) in parsed {
194 let mut bits = 0u8;
195 for prop in props {
196 if prop.contains("XID_Start") {
197 bits |= IDENTIFIER_START;
198 }
199 if prop.contains("XID_Continue") {
200 bits |= IDENTIFIER_CONTINUE;
201 }
202 }
203
204 table[ch as usize] = bits;
205 }
206
207 table.into_boxed_slice()
208 })
209 .as_ref()
210 }
211
212 #[test]
213 fn unicode_identifier_class_matches_derived_core_properties() {
214 let table = derived_identifier_table();
215 for cp in 0..=0x10ffff {
216 let Some(ch) = char::from_u32(cp) else {
217 continue;
218 };
219 let expected = table[ch as usize];
220 if cp >= 0x100000 {
221 assert_eq!(
222 expected, 0,
223 "derived data marks unsupported codepoint U+{cp:06X}"
224 );
225 }
226
227 let class = unicode_identifier_class(ch);
228 assert_eq!(
229 class.is_start(),
230 expected & IDENTIFIER_START != 0,
231 "ID_Start mismatch at U+{cp:04X}"
232 );
233 assert_eq!(
234 class.is_continue(),
235 expected & IDENTIFIER_CONTINUE != 0,
236 "ID_Continue mismatch at U+{cp:04X}"
237 );
238 }
239 }
240
241 proptest! {
242 #[test]
243 fn unicode_identifier_class_proptest(cp in any::<char>()) {
244 let expected = derived_identifier_table()[cp as usize];
245 if (cp as u32) >= 0x100000 {
246 prop_assert_eq!(
247 expected, 0,
248 "derived data marks unsupported codepoint U+{:06X}",
249 cp as u32
250 );
251 }
252
253 let class = unicode_identifier_class(cp);
254 prop_assert_eq!(
255 class.is_start(),
256 expected & IDENTIFIER_START != 0,
257 "ID_Start mismatch at U+{:04X}",
258 cp as u32
259 );
260 prop_assert_eq!(
261 class.is_continue(),
262 expected & IDENTIFIER_CONTINUE != 0,
263 "ID_Continue mismatch at U+{:04X}",
264 cp as u32
265 );
266 }
267 }
268
269 proptest! {
270 #[test]
271 fn str_and_slice_identifier_agree(chars in prop::collection::vec(any::<char>(), 0..16)) {
272 let string: String = chars.iter().copied().collect();
273 prop_assert_eq!(
274 str_is_identifier(&string),
275 is_identifier(&chars),
276 "str/is_identifier disagreement on {:?}",
277 string
278 );
279 }
280 }
281}