use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};
use crate::line_terminator::{CR, LF, LS, PS};
/// U+0000 NULL, exported under the name `EOF`.
pub const EOF: char = '\u{0000}';

/// U+200C ZERO WIDTH NON-JOINER.
pub const ZWNJ: char = '\u{200C}';

/// U+200D ZERO WIDTH JOINER.
pub const ZWJ: char = '\u{200D}';

/// U+FEFF ZERO WIDTH NO-BREAK SPACE (also used as the byte order mark).
pub const ZWNBSP: char = '\u{FEFF}';

/// U+0009 CHARACTER TABULATION (horizontal tab).
pub const TAB: char = '\u{0009}';

/// U+000B LINE TABULATION (vertical tab).
pub const VT: char = '\u{000B}';

/// U+000C FORM FEED.
pub const FF: char = '\u{000C}';

/// U+0020 SPACE.
pub const SP: char = '\u{0020}';

/// U+00A0 NO-BREAK SPACE.
pub const NBSP: char = '\u{00A0}';

/// U+0085 NEXT LINE.
const NEL: char = '\u{0085}';

/// U+1680 OGHAM SPACE MARK.
const OGHAM_SPACE_MARK: char = '\u{1680}';

/// U+2000 EN QUAD.
const EN_QUAD: char = '\u{2000}';

/// U+200B ZERO WIDTH SPACE.
const ZWSP: char = '\u{200B}';

/// U+202F NARROW NO-BREAK SPACE.
const NNBSP: char = '\u{202F}';

/// U+205F MEDIUM MATHEMATICAL SPACE.
const MMSP: char = '\u{205F}';

/// U+3000 IDEOGRAPHIC SPACE.
const IDEOGRAPHIC_SPACE: char = '\u{3000}';
/// Returns `true` if `c` is whitespace according to [`char::is_whitespace`] but is not one of
/// `TAB`, `LF`, `VT`, `FF`, `CR`, `NEL`, `LS` or `PS`.
///
/// `LF`, `CR`, `LS` and `PS` come from `crate::line_terminator`; presumably they are the usual
/// line terminators U+000A, U+000D, U+2028 and U+2029 (confirm there). Under that assumption,
/// removing these eight from Unicode `White_Space` leaves the space-separator characters
/// (plain space, no-break space, the U+2000 block spaces, ideographic space, ...).
fn is_unicode_space_separator(c: char) -> bool {
    let is_excluded = matches!(c, TAB | LF | VT | FF | CR | NEL | LS | PS);
    !is_excluded && c.is_whitespace()
}
/// Returns `true` if `c` counts as white space.
///
/// That is: `TAB`, `VT`, `FF`, `ZWNBSP`, or any character accepted by
/// `is_unicode_space_separator` (which includes the plain space `SP`).
/// Line terminators are not matched here.
pub fn is_white_space(c: char) -> bool {
    match c {
        TAB | VT | FF | ZWNBSP => true,
        _ => is_unicode_space_separator(c),
    }
}
/// Returns `true` if `c` is one of a fixed list of whitespace-like characters other than the
/// plain space (`SP`) and tab (`TAB`).
///
/// The list is spelled out in the pattern below; note that the `EN_QUAD..=ZWSP` range covers
/// every code point from U+2000 through U+200B inclusive.
#[rustfmt::skip]
pub fn is_irregular_whitespace(c: char) -> bool {
    matches!(
        c,
        VT | FF                 // U+000B, U+000C
        | NEL | NBSP            // U+0085, U+00A0
        | OGHAM_SPACE_MARK      // U+1680
        | EN_QUAD..=ZWSP        // U+2000 ..= U+200B
        | NNBSP | MMSP          // U+202F, U+205F
        | IDEOGRAPHIC_SPACE     // U+3000
        | ZWNBSP                // U+FEFF
    )
}
/// Returns `true` if `c` is a plain space, a tab, or any character accepted by
/// `is_irregular_whitespace`.
///
/// `SP` and `TAB` are tested first, before falling back to the longer irregular list.
pub fn is_white_space_single_line(c: char) -> bool {
    match c {
        SP | TAB => true,
        _ => is_irregular_whitespace(c),
    }
}
/// Flag bit in `ASCII_ID_FLAGS`: the byte may be the first character of an identifier.
const ID_START: u8 = 1;
/// Flag bit in `ASCII_ID_FLAGS`: the byte may appear after the first character of an identifier.
const ID_CONTINUE: u8 = 2;

/// Wrapper that gives its contents 64-byte alignment.
///
/// NOTE(review): 64 presumably matches a cache-line size, so that a table keeps to as few
/// cache lines as possible — confirm the intent.
#[repr(C, align(64))]
pub struct Align64<T>(pub(crate) T);

/// Identifier flags for each of the 128 ASCII bytes, indexed by byte value.
///
/// * `A`-`Z`, `a`-`z`, `$`, `_` have both `ID_START` and `ID_CONTINUE` set (value `3`).
/// * `0`-`9` have only `ID_CONTINUE` set (value `2`).
/// * Every other ASCII byte has no flag set (value `0`).
///
/// The table is computed at compile time from those rules.
pub static ASCII_ID_FLAGS: Align64<[u8; 128]> = Align64({
    let mut flags = [0u8; 128];
    let mut byte = 0u8;
    // `while` instead of `for`: iterators cannot be used in a constant initializer.
    // `is_ascii` holds for exactly 0..=127, i.e. for every index of the table.
    while byte.is_ascii() {
        flags[byte as usize] = match byte {
            b'A'..=b'Z' | b'a'..=b'z' | b'$' | b'_' => ID_START | ID_CONTINUE,
            b'0'..=b'9' => ID_CONTINUE,
            _ => 0,
        };
        byte += 1;
    }
    flags
});
/// Returns `true` if `c` may be the first character of an identifier.
///
/// ASCII characters are answered by `is_identifier_start_ascii` (a table lookup); everything
/// else is delegated to `is_identifier_start_unicode`.
#[inline]
pub fn is_identifier_start(c: char) -> bool {
    if c.is_ascii() { is_identifier_start_ascii(c) } else { is_identifier_start_unicode(c) }
}
/// Returns `true` if the ASCII character `c` has the `ID_START` flag set in `ASCII_ID_FLAGS`.
///
/// # Panics
///
/// Panics if `c` is not ASCII: the table has 128 entries and is indexed by `c as usize`.
/// Callers are expected to check `c.is_ascii()` first, as `is_identifier_start` does.
#[inline]
pub fn is_identifier_start_ascii(c: char) -> bool {
    let flags = ASCII_ID_FLAGS.0[c as usize];
    (flags & ID_START) != 0
}
/// Returns `true` if `c` may be the first character of an identifier, as decided by
/// `is_id_start_unicode` from the `unicode_id_start` crate.
///
/// `is_identifier_start` only routes non-ASCII characters here; whether this function also
/// gives the right answer for ASCII input depends on the external crate and is not visible
/// in this file.
#[inline]
pub fn is_identifier_start_unicode(c: char) -> bool {
    is_id_start_unicode(c)
}
/// Returns `true` if `c` may appear in an identifier after its first character.
///
/// ASCII characters are answered by `is_identifier_part_ascii` (a table lookup); everything
/// else is delegated to `is_identifier_part_unicode`.
#[inline]
pub fn is_identifier_part(c: char) -> bool {
    if c.is_ascii() { is_identifier_part_ascii(c) } else { is_identifier_part_unicode(c) }
}
/// Returns `true` if the ASCII character `c` has the `ID_CONTINUE` flag set in
/// `ASCII_ID_FLAGS`.
///
/// # Panics
///
/// Panics if `c` is not ASCII: the table has 128 entries and is indexed by `c as usize`.
/// Callers are expected to check `c.is_ascii()` first, as `is_identifier_part` does.
#[inline]
pub fn is_identifier_part_ascii(c: char) -> bool {
    let flags = ASCII_ID_FLAGS.0[c as usize];
    (flags & ID_CONTINUE) != 0
}
#[inline]
pub fn is_identifier_part_unicode(c: char) -> bool {
is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
}
// Written as escapes because the two glyphs are hard to tell apart in most fonts.

/// U+30FB KATAKANA MIDDLE DOT.
const KATAKANA_MIDDLE_DOT: char = '\u{30FB}';
/// U+FF65 HALFWIDTH KATAKANA MIDDLE DOT.
const HALFWIDTH_KATAKANA_MIDDLE_DOT: char = '\u{FF65}';
/// Returns `true` if `name` is a valid identifier name: non-empty, first character accepted
/// by the identifier-start check, and every following character accepted by
/// `is_identifier_part`.
///
/// Runs the shared implementation with `PATCHED = false`, i.e. without the extra rejection
/// of the katakana middle dots applied by `is_identifier_name_patched`.
pub fn is_identifier_name(name: &str) -> bool {
    is_identifier_name_impl::<false>(name)
}
/// Same check as `is_identifier_name`, except that a character after the first which equals
/// `KATAKANA_MIDDLE_DOT` (U+30FB) or `HALFWIDTH_KATAKANA_MIDDLE_DOT` (U+FF65) makes the result
/// `false`.
///
/// Runs the shared implementation with `PATCHED = true`.
///
/// NOTE(review): presumably this exists for consumers that do not accept those two characters
/// inside identifiers — confirm the motivation with the callers.
pub fn is_identifier_name_patched(name: &str) -> bool {
    is_identifier_name_impl::<true>(name)
}
/// Shared implementation behind `is_identifier_name` and `is_identifier_name_patched`.
///
/// Returns `true` only if `name` is non-empty, its first character passes the identifier-start
/// check, and every later character passes the identifier-part check. With `PATCHED == true`,
/// a later character equal to `KATAKANA_MIDDLE_DOT` or `HALFWIDTH_KATAKANA_MIDDLE_DOT`
/// additionally makes the result `false`.
///
/// Strategy: while the input is ASCII it is scanned as raw bytes against `ASCII_ID_FLAGS`
/// (8 bytes per step, then 4, then 1). The first non-ASCII byte ends that scan, and the rest of
/// the string is checked one `char` at a time.
fn is_identifier_name_impl<const PATCHED: bool>(name: &str) -> bool {
    let bytes = name.as_bytes();
    // An empty string is never an identifier name.
    let Some(&first_byte) = bytes.first() else { return false };
    // `chars` becomes an iterator over the part of `name` that still needs `char`-level checks.
    let mut chars = if first_byte.is_ascii() {
        // ASCII first byte: it must carry the `ID_START` flag.
        if ASCII_ID_FLAGS.0[first_byte as usize] & ID_START == 0 {
            return false;
        }
        // Offset of the next unchecked byte. Every byte before it is ASCII and already accepted,
        // so `index` is always a `char` boundary and never exceeds `bytes.len()`.
        let mut index = 1;
        'outer: loop {
            let bytes_remaining = bytes.len() - index;
            if bytes_remaining >= 8 {
                // SAFETY: at least 8 bytes remain starting at `index`, so the 8-byte read stays
                // inside `bytes`. `read_unaligned` has no alignment requirement on the pointer.
                #[expect(clippy::cast_ptr_alignment)]
                let next8_as_u64 = unsafe {
                    let ptr = bytes.as_ptr().add(index).cast::<u64>();
                    ptr.read_unaligned()
                };
                // A set top bit in any of the 8 bytes means that byte is not ASCII.
                let high_bits = next8_as_u64 & 0x8080_8080_8080_8080;
                if high_bits != 0 {
                    // `index` stays at the start of this block: the `char`-based pass below
                    // re-checks the block's ASCII bytes along with the non-ASCII ones.
                    break;
                }
                // Native-endian conversion yields the same 8 bytes in their original order.
                let next8 = next8_as_u64.to_ne_bytes();
                for b in next8 {
                    if ASCII_ID_FLAGS.0[b as usize] & ID_CONTINUE == 0 {
                        return false;
                    }
                }
                index += 8;
            } else if bytes_remaining >= 4 {
                // Same procedure as above, for a block of 4 bytes.
                // SAFETY: at least 4 bytes remain starting at `index`, so the 4-byte read stays
                // inside `bytes`. `read_unaligned` has no alignment requirement on the pointer.
                #[expect(clippy::cast_ptr_alignment)]
                let next4_as_u32 = unsafe {
                    let ptr = bytes.as_ptr().add(index).cast::<u32>();
                    ptr.read_unaligned()
                };
                let high_bits = next4_as_u32 & 0x8080_8080;
                if high_bits != 0 {
                    // Non-ASCII byte somewhere in this block: hand over to the `char` pass.
                    break;
                }
                let next4 = next4_as_u32.to_ne_bytes();
                for b in next4 {
                    if ASCII_ID_FLAGS.0[b as usize] & ID_CONTINUE == 0 {
                        return false;
                    }
                }
                index += 4;
            } else {
                // Fewer than 4 bytes left: finish one byte at a time.
                loop {
                    let Some(&b) = bytes.get(index) else {
                        // End of input reached and every byte was an ASCII identifier character.
                        // `PATCHED` is irrelevant here: both excluded characters are non-ASCII.
                        return true;
                    };
                    if b.is_ascii() {
                        if ASCII_ID_FLAGS.0[b as usize] & ID_CONTINUE == 0 {
                            return false;
                        }
                    } else {
                        // Non-ASCII byte: hand over to the `char` pass starting at this byte.
                        break 'outer;
                    }
                    index += 1;
                }
            }
        }
        // Only reached after a non-ASCII byte was seen at or after `index`. Slicing cannot split
        // a character because everything before `index` is single-byte ASCII.
        name[index..].chars()
    } else {
        // Non-ASCII first character: decode it and apply the Unicode start check.
        let mut chars = name.chars();
        // Cannot fail: `bytes.first()` above already proved that `name` is non-empty.
        let first_char = chars.next().unwrap();
        if !is_identifier_start_unicode(first_char) {
            return false;
        }
        chars
    };
    // Check the remaining characters; `PATCHED` is a const parameter, so only one branch
    // applies for each instantiation.
    if PATCHED {
        chars.all(|c| {
            is_identifier_part(c) && c != KATAKANA_MIDDLE_DOT && c != HALFWIDTH_KATAKANA_MIDDLE_DOT
        })
    } else {
        chars.all(is_identifier_part)
    }
}
/// Strings that `is_identifier_name` must accept.
#[test]
fn is_identifier_name_true() {
    let cases = [
        // Single ASCII characters that may start an identifier.
        "a",
        "z",
        "A",
        "Z",
        "_",
        "$",
        // Single non-ASCII characters encoded in 2, 3 and 4 UTF-8 bytes, then the first of
        // the two-character ASCII names.
        "µ", "ख", "𐀀", "az",
        "AZ",
        "_a",
        "$Z",
        // A start character followed by a digit.
        "a0",
        "A9",
        "_0",
        "$9",
        // 64-byte ASCII names: long enough to go through the 8-byte block path several times.
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$",
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_$",
        "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789$",
        "$abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_",
        // Non-ASCII only, and ASCII mixed with non-ASCII in both orders.
        "µख𐀀",
        "AµBखC𐀀D",
        "µAखB𐀀",
    ];
    for str in cases {
        assert!(is_identifier_name(str));
    }
}
/// Strings that `is_identifier_name` must reject.
#[test]
fn is_identifier_name_false() {
    let cases = [
        // Empty string.
        "",
        // Single ASCII characters that cannot start an identifier.
        "0",
        "9",
        "-",
        "~",
        "+",
        // Single non-ASCII characters (2, 3 and 4 UTF-8 bytes) that cannot start an identifier,
        // then the first of the names with an invalid ASCII start.
        "£", "৸", "𐄬", "0a",
        "9a",
        "-a",
        "+a",
        // Valid start followed by an invalid ASCII character.
        "a-Z",
        "A+z",
        "a-",
        "a+",
        // Invalid non-ASCII characters, alone and after a valid ASCII start.
        "£৸𐄬",
        "A£",
        "A৸",
        "A𐄬",
        // Long valid ASCII prefix followed by an invalid non-ASCII character at the end.
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc£",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc৸",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc𐄬",
        // Same, with more valid ASCII after the invalid character.
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc£abcdefghijklmnopqrstuvwxyz",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc৸abcdefghijklmnopqrstuvwxyz",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc𐄬abcdefghijklmnopqrstuvwxyz",
        // Invalid non-ASCII start followed by a valid ASCII character.
        "£A",
        "৸A",
        "𐄬A",
    ];
    for str in cases {
        assert!(!is_identifier_name(str));
    }
}
/// Checks where `is_identifier_name_patched` differs from `is_identifier_name`, and that it
/// agrees with it everywhere else.
#[test]
fn is_identifier_name_patched_rejects_katakana_dots() {
    // U+30FB after the first character: accepted unpatched, rejected patched.
    assert!(is_identifier_name("x\u{30FB}"));
    assert!(!is_identifier_name_patched("x\u{30FB}"));
    // U+FF65 after the first character: accepted unpatched, rejected patched.
    assert!(is_identifier_name("x\u{FF65}"));
    assert!(!is_identifier_name_patched("x\u{FF65}"));
    // U+30FB as the first character is rejected by both variants.
    assert!(!is_identifier_name("\u{30FB}"));
    assert!(!is_identifier_name_patched("\u{30FB}"));
    // Ordinary names are still accepted by the patched variant.
    assert!(is_identifier_name_patched("foo"));
    assert!(is_identifier_name_patched("_bar"));
    assert!(is_identifier_name_patched("$baz"));
    assert!(is_identifier_name_patched("µ"));
    // The empty string is still rejected.
    assert!(!is_identifier_name_patched(""));
}