oxc_syntax/
identifier.rs

1#![expect(missing_docs)] // fixme
2
3use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};
4
5use oxc_data_structures::assert_unchecked;
6
/// U+0000 NULL.
///
/// NOTE(review): the name suggests this is used as an end-of-input sentinel, but no use is
/// visible in this file - confirm against callers.
pub const EOF: char = '\u{0000}';
8
// 11.1 Unicode Format-Control Characters

/// U+200C ZERO WIDTH NON-JOINER, abbreviated in the spec as `<ZWNJ>`.
///
/// Accepted by `is_identifier_part_unicode`, i.e. specially permitted inside identifiers.
pub const ZWNJ: char = '\u{200C}';

/// U+200D ZERO WIDTH JOINER, abbreviated in the spec as `<ZWJ>`.
///
/// Accepted by `is_identifier_part_unicode`, i.e. specially permitted inside identifiers.
pub const ZWJ: char = '\u{200D}';

/// U+FEFF ZERO WIDTH NO-BREAK SPACE, abbreviated in the spec as `<ZWNBSP>`.
///
/// Accepted by `is_white_space`, i.e. considered a whitespace character in JS.
pub const ZWNBSP: char = '\u{FEFF}';
22
// 11.2 White Space

/// U+0009 CHARACTER TABULATION, abbreviated `<TAB>`.
pub const TAB: char = '\u{0009}';

/// U+000B VERTICAL TAB, abbreviated `<VT>`.
pub const VT: char = '\u{000B}';

/// U+000C FORM FEED, abbreviated `<FF>`.
pub const FF: char = '\u{000C}';

/// U+0020 SPACE, abbreviated `<SP>`.
pub const SP: char = '\u{0020}';

/// U+00A0 NO-BREAK SPACE, abbreviated `<NBSP>`.
pub const NBSP: char = '\u{00A0}';

/// U+0085 NEXT LINE, abbreviated `<NEL>`.
const NEL: char = '\u{0085}';

/// U+1680 OGHAM SPACE MARK.
const OGHAM_SPACE_MARK: char = '\u{1680}';

/// U+2000 EN QUAD.
const EN_QUAD: char = '\u{2000}';

/// U+200B ZERO WIDTH SPACE, abbreviated `<ZWSP>`.
const ZWSP: char = '\u{200B}';

/// U+202F NARROW NO-BREAK SPACE, abbreviated `<NNBSP>`.
const NNBSP: char = '\u{202F}';

/// U+205F MEDIUM MATHEMATICAL SPACE, abbreviated `<MMSP>`.
const MMSP: char = '\u{205F}';

/// U+3000 IDEOGRAPHIC SPACE.
const IDEOGRAPHIC_SPACE: char = '\u{3000}';
56
57fn is_unicode_space_separator(c: char) -> bool {
58    // is_whitespace matches Unicode `White_Space` property
59    // exclude the characters that are included in `White_Space`, but not `Space_Separator`
60    // <https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BWhite_Space%7D%26%5CP%7BGeneral_Category%3DSpace_Separator%7D>
61    c.is_whitespace() && !matches!(c, TAB | LF | VT | FF | CR | NEL | LS | PS)
62}
63
64pub fn is_white_space(c: char) -> bool {
65    matches!(c, TAB | VT | FF | ZWNBSP) || is_unicode_space_separator(c)
66}
67
68// https://eslint.org/docs/latest/rules/no-irregular-whitespace#rule-details
69#[rustfmt::skip]
70pub fn is_irregular_whitespace(c: char) -> bool {
71    matches!(c,
72        VT | FF | NBSP | ZWNBSP | NEL | OGHAM_SPACE_MARK
73        | EN_QUAD..=ZWSP | NNBSP | MMSP | IDEOGRAPHIC_SPACE
74    )
75}
76
77// https://github.com/microsoft/TypeScript/blob/b8e4ed8aeb0b228f544c5736908c31f136a9f7e3/src/compiler/scanner.ts#L556
78pub fn is_white_space_single_line(c: char) -> bool {
79    // Note: nextLine is in the Zs space, and should be considered to be a whitespace.
80    // It is explicitly not a line-break as it isn't in the exact set specified by EcmaScript.
81    matches!(c, SP | TAB) || is_irregular_whitespace(c)
82}
83
// 11.3 Line Terminators

/// U+000A LINE FEED, abbreviated in the spec as `<LF>`.
pub const LF: char = '\u{000A}';

/// U+000D CARRIAGE RETURN, abbreviated in the spec as `<CR>`.
pub const CR: char = '\u{000D}';

/// U+2028 LINE SEPARATOR, abbreviated in the spec as `<LS>`.
pub const LS: char = '\u{2028}';

/// U+2029 PARAGRAPH SEPARATOR, abbreviated in the spec as `<PS>`.
pub const PS: char = '\u{2029}';
97
98pub fn is_regular_line_terminator(c: char) -> bool {
99    matches!(c, LF | CR)
100}
101
102pub fn is_irregular_line_terminator(c: char) -> bool {
103    matches!(c, LS | PS)
104}
105
106pub fn is_line_terminator(c: char) -> bool {
107    is_regular_line_terminator(c) || is_irregular_line_terminator(c)
108}
109
// Shorthand for the entries of the `ASCII_START` and `ASCII_CONTINUE` lookup tables.
// Both names are two characters wide, which keeps the table columns aligned.
// `XX` marks a byte that is accepted, `__` a byte that is rejected.
const XX: bool = true;
const __: bool = false;
112
// Wrapper which forces its content to be aligned on a 64-byte boundary.
// Used in this file to wrap the 128-entry ASCII lookup tables.
// NOTE(review): 64 is presumably chosen to match a typical cache line size - confirm.
#[repr(C, align(64))]
pub struct Align64<T>(pub(crate) T);
115
// Lookup table of the ASCII bytes which can start an identifier:
// `a`-`z`, `A`-`Z`, `$` (0x24), `_` (0x5F)
//
// The table is indexed by byte value. Each row below covers 16 bytes: the trailing comment
// on a row is the high hex digit of the byte, the column header is the low hex digit.
#[rustfmt::skip]
pub static ASCII_START: Align64<[bool; 128]> = Align64([
//  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F   //
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 0
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 1
    __, __, __, __, XX, __, __, __, __, __, __, __, __, __, __, __, // 2
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
    __, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 4
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, __, __, __, __, XX, // 5
    __, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 6
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, __, __, __, __, __, // 7
]);
129
// Lookup table of the ASCII bytes which can appear after the first char of an identifier:
// `ASCII_START` + `0`-`9`
//
// The table is indexed by byte value. Each row below covers 16 bytes: the trailing comment
// on a row is the high hex digit of the byte, the column header is the low hex digit.
#[rustfmt::skip]
pub static ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
//  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F   //
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 0
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 1
    __, __, __, __, XX, __, __, __, __, __, __, __, __, __, __, __, // 2
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, __, __, __, __, __, __, // 3
    __, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 4
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, __, __, __, __, XX, // 5
    __, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 6
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, __, __, __, __, __, // 7
]);
143
144/// Section 12.7 Detect `IdentifierStartChar`
145#[inline]
146pub fn is_identifier_start(c: char) -> bool {
147    if c.is_ascii() {
148        return is_identifier_start_ascii(c);
149    }
150    is_identifier_start_unicode(c)
151}
152
153#[inline]
154pub fn is_identifier_start_ascii(c: char) -> bool {
155    ASCII_START.0[c as usize]
156}
157
/// Returns `true` if `c` can start an identifier, as decided by
/// `unicode_id_start::is_id_start_unicode`.
///
/// Within this file it is only reached for non-ASCII chars; ASCII chars are answered by
/// [`is_identifier_start_ascii`] instead.
#[inline]
pub fn is_identifier_start_unicode(c: char) -> bool {
    is_id_start_unicode(c)
}
162
163/// Section 12.7 Detect `IdentifierPartChar`
164/// NOTE 2: The nonterminal `IdentifierPart` derives _ via `UnicodeIDContinue`.
165#[inline]
166pub fn is_identifier_part(c: char) -> bool {
167    if c.is_ascii() {
168        return is_identifier_part_ascii(c);
169    }
170    is_identifier_part_unicode(c)
171}
172
173#[inline]
174pub fn is_identifier_part_ascii(c: char) -> bool {
175    ASCII_CONTINUE.0[c as usize]
176}
177
178#[inline]
179pub fn is_identifier_part_unicode(c: char) -> bool {
180    is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
181}
182
183/// Determine if a string is a valid JS identifier.
184#[expect(clippy::missing_panics_doc)]
185pub fn is_identifier_name(name: &str) -> bool {
186    // This function contains a fast path for ASCII (common case), iterating over bytes and using
187    // the cheap `is_identifier_start_ascii` and `is_identifier_part_ascii` to test bytes.
188    // Only if a Unicode char is found, fall back to iterating over `char`s, and using the more
189    // expensive `is_identifier_start_unicode` and `is_identifier_part`.
190    // As a further optimization, we test if bytes are ASCII in blocks of 8 or 4 bytes, rather than 1 by 1.
191
192    // Get first byte. Exit if empty string.
193    let bytes = name.as_bytes();
194    let Some(&first_byte) = bytes.first() else { return false };
195
196    let mut chars = if first_byte.is_ascii() {
197        // First byte is ASCII
198        if !is_identifier_start_ascii(first_byte as char) {
199            return false;
200        }
201
202        let mut index = 1;
203        'outer: loop {
204            // Check blocks of 8 bytes, then 4 bytes, then single bytes
205            let bytes_remaining = bytes.len() - index;
206            if bytes_remaining >= 8 {
207                // Process block of 8 bytes.
208                // Check that next 8 bytes are all ASCII.
209                // SAFETY: We checked above that there are at least 8 bytes to read starting at `index`
210                #[expect(clippy::cast_ptr_alignment)]
211                let next8_as_u64 = unsafe {
212                    let ptr = bytes.as_ptr().add(index).cast::<u64>();
213                    ptr.read_unaligned()
214                };
215                let high_bits = next8_as_u64 & 0x8080_8080_8080_8080;
216                if high_bits != 0 {
217                    // Some chars in this block are non-ASCII
218                    break;
219                }
220
221                let next8 = next8_as_u64.to_ne_bytes();
222                for b in next8 {
223                    // SAFETY: We just checked all these bytes are ASCII
224                    unsafe { assert_unchecked!(b.is_ascii()) };
225                    if !is_identifier_part_ascii(b as char) {
226                        return false;
227                    }
228                }
229
230                index += 8;
231            } else if bytes_remaining >= 4 {
232                // Process block of 4 bytes.
233                // Check that next 4 bytes are all ASCII.
234                // SAFETY: We checked above that there are at least 4 bytes to read starting at `index`
235                #[expect(clippy::cast_ptr_alignment)]
236                let next4_as_u32 = unsafe {
237                    let ptr = bytes.as_ptr().add(index).cast::<u32>();
238                    ptr.read_unaligned()
239                };
240                let high_bits = next4_as_u32 & 0x8080_8080;
241                if high_bits != 0 {
242                    // Some chars in this block are non-ASCII
243                    break;
244                }
245
246                let next4 = next4_as_u32.to_ne_bytes();
247                for b in next4 {
248                    // SAFETY: We just checked all these bytes are ASCII
249                    unsafe { assert_unchecked!(b.is_ascii()) };
250                    if !is_identifier_part_ascii(b as char) {
251                        return false;
252                    }
253                }
254
255                index += 4;
256            } else {
257                loop {
258                    let Some(&b) = bytes.get(index) else {
259                        // We got to the end with no non-identifier chars found
260                        return true;
261                    };
262
263                    if b.is_ascii() {
264                        if !is_identifier_part_ascii(b as char) {
265                            return false;
266                        }
267                    } else {
268                        // Unicode byte found
269                        break 'outer;
270                    }
271
272                    index += 1;
273                }
274            }
275        }
276
277        // Unicode byte found - search rest of string (from this byte onwards) as Unicode
278        name[index..].chars()
279    } else {
280        // First char is Unicode.
281        // NB: `unwrap()` cannot fail because we already checked the string is not empty.
282        let mut chars = name.chars();
283        let first_char = chars.next().unwrap();
284        if !is_identifier_start_unicode(first_char) {
285            return false;
286        }
287        // Search rest of string as Unicode
288        chars
289    };
290
291    // A Unicode char was found - search rest of string as Unicode
292    chars.all(is_identifier_part)
293}
294
/// Strings which `is_identifier_name` must accept.
#[test]
fn is_identifier_name_true() {
    let cases = [
        // 1 char ASCII
        "a",
        "z",
        "A",
        "Z",
        "_",
        "$",
        // 1 char Unicode
        "µ", // 2 bytes
        "ख", // 3 bytes
        "𐀀", // 4 bytes
        // Multiple chars ASCII
        "az",
        "AZ",
        "_a",
        "$Z",
        "a0",
        "A9",
        "_0",
        "$9",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$",
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_$",
        "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789$",
        "$abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_",
        // Multiple chars Unicode
        "µख𐀀",
        // ASCII + Unicode, starting with ASCII
        "AµBखC𐀀D",
        // ASCII + Unicode, starting with Unicode
        "µAखB𐀀",
        // `<ZWNJ>` and `<ZWJ>` are permitted after the first char
        "a\u{200c}b",
        "a\u{200d}",
    ];

    for case in cases {
        // Include the offending string in the message, so a failure identifies the case
        assert!(is_identifier_name(case), "expected {case:?} to be a valid identifier name");
    }
}
334
/// Strings which `is_identifier_name` must reject.
#[test]
fn is_identifier_name_false() {
    let cases = [
        // Empty string
        "",
        // 1 char ASCII
        "0",
        "9",
        "-",
        "~",
        "+",
        // 1 char Unicode
        "£", // 2 bytes
        "৸", // 3 bytes
        "𐄬", // 4 bytes
        // Multiple chars ASCII
        "0a",
        "9a",
        "-a",
        "+a",
        "a-Z",
        "A+z",
        "a-",
        "a+",
        "a b",
        // Invalid ASCII char inside a 4-byte block
        "abcd-",
        // Invalid ASCII char inside an 8-byte block
        "abcdefgh-",
        // Invalid ASCII char inside a 4-byte block which follows an 8-byte block
        "abcdefghijkl-",
        // Invalid ASCII char in the trailing single-byte loop, after 8-byte and 4-byte blocks
        "abcdefghijklm-",
        // Multiple chars Unicode
        "£৸𐄬",
        // `<ZWNJ>` is only permitted after the first char
        "\u{200c}a",
        // ASCII + Unicode, starting with ASCII
        "A£",
        "A৸",
        "A𐄬",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc£",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc৸",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc𐄬",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc£abcdefghijklmnopqrstuvwxyz",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc৸abcdefghijklmnopqrstuvwxyz",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc𐄬abcdefghijklmnopqrstuvwxyz",
        // ASCII + Unicode, starting with Unicode
        "£A",
        "৸A",
        "𐄬A",
    ];

    for case in cases {
        // Include the offending string in the message, so a failure identifies the case
        assert!(!is_identifier_name(case), "expected {case:?} to be rejected as an identifier name");
    }
}
oxc_syntax/identifier.rs

oxc_syntax/
identifier.rs