unicode_id_trie_rle/
lib.rs

1#![doc = include_str!("../README.md")]
2#![cfg_attr(not(test), no_std)]
3const IDENTIFIER_OTHER: u8 = 0;
4const IDENTIFIER_START: u8 = 1;
5const IDENTIFIER_CONTINUE: u8 = 2;
6const START_CODEPOINT: u32 = 0x80;
7
8include!(concat!(env!("OUT_DIR"), "/table.rs"));
9
10const BLOCK_MASK: u32 = (1 << SHIFT) - 1;
11const LOWER_MASK: u32 = (1 << LOWER_BITS) - 1;
12const ASCII_TABLE: [u8; 128] = ascii_table();
13
14#[derive(Clone, Copy)]
15struct Leaf {
16    offset: usize,
17    len: usize,
18}
19
20/// A Unicode identifier class, as returned by [unicode_identifier_class]. Use
21/// the [UnicodeIdentifierClass::is_start] and
22/// [UnicodeIdentifierClass::is_continue] methods to query specific properties.
23pub struct UnicodeIdentifierClass(u8);
24
25impl UnicodeIdentifierClass {
26    /// Returns whether or not the codepoint was one of the `*_Start`
27    /// identifiers.
28    #[inline]
29    pub fn is_start(&self) -> bool {
30        self.0 & IDENTIFIER_START != 0
31    }
32
33    /// Returns whether or not the codepoint was one of the `*_Continue`
34    /// identifiers.
35    #[inline]
36    pub fn is_continue(&self) -> bool {
37        self.0 & IDENTIFIER_CONTINUE != 0
38    }
39}
40
41#[inline]
42fn load_leaf(idx: usize) -> Leaf {
43    debug_assert!(idx + 1 < LEAF_OFFSETS.len());
44    let start = LEAF_OFFSETS[idx] as usize;
45    let end = LEAF_OFFSETS[idx + 1] as usize;
46    Leaf {
47        offset: start,
48        len: end - start,
49    }
50}
51
52#[inline]
53fn leaf_value(leaf: Leaf, offset: u16) -> UnicodeIdentifierClass {
54    debug_assert!(leaf.len >= 2);
55    let runs = &LEAF_RUN_STARTS[leaf.offset..leaf.offset + leaf.len];
56    let values = &LEAF_RUN_VALUES[leaf.offset..leaf.offset + leaf.len];
57    // runs are ascending with runs[0] == 0 and a sentinel at the end.
58    let idx = runs.partition_point(|&start| start <= offset);
59    UnicodeIdentifierClass(values[idx.saturating_sub(1)])
60}
61
62/// Returns whether the codepoint specified has the properties `ID_Start`,
63/// `XID_Start` or the properties `ID_Continue` or `XID_Continue`.
64#[inline]
65pub fn unicode_identifier_class(cp: char) -> UnicodeIdentifierClass {
66    // ASCII fast path via table to avoid unpredictable branches.
67    if (cp as u32) < START_CODEPOINT {
68        return UnicodeIdentifierClass(ASCII_TABLE[cp as usize]);
69    }
70
71    if (cp as u32) >= 0x100000 {
72        return UnicodeIdentifierClass(IDENTIFIER_OTHER);
73    }
74
75    let cp = cp as u32;
76    let block = cp >> SHIFT;
77    debug_assert!(block < BLOCK_COUNT as u32);
78    let top = (block >> LOWER_BITS) as usize;
79    let bottom = (block & LOWER_MASK) as usize;
80    let level2_idx = LEVEL1_TABLE[top] as usize;
81    let leaf_idx = LEVEL2_TABLES[level2_idx * LOWER_SIZE + bottom] as usize;
82    let leaf = load_leaf(leaf_idx);
83    let offset = (cp & BLOCK_MASK) as u16;
84    leaf_value(leaf, offset)
85}
86
87const fn ascii_table() -> [u8; 128] {
88    let mut table = [0u8; 128];
89    let mut c = b'A';
90    while c <= b'Z' {
91        table[c as usize] = IDENTIFIER_START | IDENTIFIER_CONTINUE;
92        c += 1;
93    }
94    c = b'a';
95    while c <= b'z' {
96        table[c as usize] = IDENTIFIER_START | IDENTIFIER_CONTINUE;
97        c += 1;
98    }
99    c = b'0';
100    while c <= b'9' {
101        table[c as usize] = IDENTIFIER_CONTINUE;
102        c += 1;
103    }
104    table[b'_' as usize] = IDENTIFIER_CONTINUE;
105    table
106}
107
108/// Checks if a codepoint is a unicode identifier, defined by
109/// Unicode Standard Annex #31.
110///
111/// This function implements the "Default Identifiers" specification, labeled
112/// `UAX31-R1-1` in the document, meaning the `XID_Start` and `XID_Continue`
113/// properties are used in determining whether something is a valid identifier.
114#[inline]
115pub fn is_identifier(cp: &[char]) -> bool {
116    if cp.is_empty() {
117        return false;
118    }
119
120    if !unicode_identifier_class(cp[0]).is_start() {
121        return false;
122    }
123
124    for (i, c) in cp.iter().enumerate() {
125        if !unicode_identifier_class(*c).is_continue() {
126            // the two special characters are only allowed in the
127            // middle, not the end.
128            if (*c != '\u{200c}' && *c != '\u{200d}') || i + 1 == cp.len() {
129                return false;
130            }
131        }
132    }
133
134    true
135}
136
137/// Checks if a given string is a unicode identifier, defined by Unicode
138/// Standard Annex #31.
139///
140/// This function implements the "Default Identifiers" specification, labeled
141/// `UAX31-R1-1` in the document, meaning the `XID_Start` and `XID_Continue`
142/// properties are used in determining whether something is a valid identifier.
143#[inline]
144pub fn str_is_identifier(s: &str) -> bool {
145    let mut iter = s.chars();
146    let Some(first) = iter.next() else {
147        return false;
148    };
149
150    if !unicode_identifier_class(first).is_start() {
151        return false;
152    }
153
154    let mut iter = iter.peekable();
155    while let Some(c) = iter.next() {
156        if !unicode_identifier_class(c).is_continue() {
157            // the two special characters are only allowed in the
158            // middle, not the end.
159            if (c != '\u{200c}' && c != '\u{200d}') || iter.peek().is_none() {
160                return false;
161            }
162        }
163    }
164
165    true
166}
167
168#[cfg(test)]
169mod tests {
170    use super::*;
171    use proptest::prelude::*;
172    use std::{fs::File, path::PathBuf, sync::OnceLock};
173
174    const MAX_SCALAR: usize = 0x110000;
175
176    fn derived_identifier_table() -> &'static [u8] {
177        static TABLE: OnceLock<Box<[u8]>> = OnceLock::new();
178        TABLE
179            .get_or_init(|| {
180                let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
181                let derived_path =
182                    manifest_dir.join("./DerivedCoreProperties.txt");
183                let file = File::open(&derived_path).unwrap_or_else(|err| {
184                    panic!("failed to open {}: {err}", derived_path.display())
185                });
186
187                let parsed =
188                    unicode_id_trie_rle_derived_core_properties::parse(file)
189                        .unwrap_or_else(|err| {
190                            panic!("failed to parse derived data: {err}")
191                        });
192                let mut table = vec![0u8; MAX_SCALAR];
193                for (ch, props) in parsed {
194                    let mut bits = 0u8;
195                    for prop in props {
196                        if prop.contains("XID_Start") {
197                            bits |= IDENTIFIER_START;
198                        }
199                        if prop.contains("XID_Continue") {
200                            bits |= IDENTIFIER_CONTINUE;
201                        }
202                    }
203
204                    table[ch as usize] = bits;
205                }
206
207                table.into_boxed_slice()
208            })
209            .as_ref()
210    }
211
212    #[test]
213    fn unicode_identifier_class_matches_derived_core_properties() {
214        let table = derived_identifier_table();
215        for cp in 0..=0x10ffff {
216            let Some(ch) = char::from_u32(cp) else {
217                continue;
218            };
219            let expected = table[ch as usize];
220            if cp >= 0x100000 {
221                assert_eq!(
222                    expected, 0,
223                    "derived data marks unsupported codepoint U+{cp:06X}"
224                );
225            }
226
227            let class = unicode_identifier_class(ch);
228            assert_eq!(
229                class.is_start(),
230                expected & IDENTIFIER_START != 0,
231                "ID_Start mismatch at U+{cp:04X}"
232            );
233            assert_eq!(
234                class.is_continue(),
235                expected & IDENTIFIER_CONTINUE != 0,
236                "ID_Continue mismatch at U+{cp:04X}"
237            );
238        }
239    }
240
241    proptest! {
242        #[test]
243        fn unicode_identifier_class_proptest(cp in any::<char>()) {
244            let expected = derived_identifier_table()[cp as usize];
245            if (cp as u32) >= 0x100000 {
246                prop_assert_eq!(
247                    expected, 0,
248                    "derived data marks unsupported codepoint U+{:06X}",
249                    cp as u32
250                );
251            }
252
253            let class = unicode_identifier_class(cp);
254            prop_assert_eq!(
255                class.is_start(),
256                expected & IDENTIFIER_START != 0,
257                "ID_Start mismatch at U+{:04X}",
258                cp as u32
259            );
260            prop_assert_eq!(
261                class.is_continue(),
262                expected & IDENTIFIER_CONTINUE != 0,
263                "ID_Continue mismatch at U+{:04X}",
264                cp as u32
265            );
266        }
267    }
268
269    proptest! {
270        #[test]
271        fn str_and_slice_identifier_agree(chars in prop::collection::vec(any::<char>(), 0..16)) {
272            let string: String = chars.iter().copied().collect();
273            prop_assert_eq!(
274                str_is_identifier(&string),
275                is_identifier(&chars),
276                "str/is_identifier disagreement on {:?}",
277                string
278            );
279        }
280    }
281}