sourcemap/
js_identifiers.rs

1/// Returns true if `c` is a valid character for an identifier start.
2fn is_valid_start(c: char) -> bool {
3    c == '$' || c == '_' || c.is_ascii_alphabetic() || {
4        if c.is_ascii() {
5            false
6        } else {
7            unicode_id_start::is_id_start_unicode(c)
8        }
9    }
10}
11
12/// Returns true if `c` is a valid character for an identifier part after start.
13fn is_valid_continue(c: char) -> bool {
14    // As specified by the ECMA-262 spec, U+200C (ZERO WIDTH NON-JOINER) and U+200D
15    // (ZERO WIDTH JOINER) are format-control characters that are used to make necessary
16    // distinctions when forming words or phrases in certain languages. They are however
17    // not considered by UnicodeID to be universally valid identifier characters.
18    c == '$' || c == '_' || c == '\u{200c}' || c == '\u{200d}' || c.is_ascii_alphanumeric() || {
19        if c.is_ascii() {
20            false
21        } else {
22            unicode_id_start::is_id_continue_unicode(c)
23        }
24    }
25}
26
27fn strip_identifier(s: &str) -> Option<&str> {
28    let mut iter = s.char_indices();
29    // Is the first character a valid starting character
30    let first_char_len = match iter.next() {
31        Some((_, c)) => {
32            if !is_valid_start(c) {
33                return None;
34            }
35            c.len_utf8()
36        }
37        None => {
38            return None;
39        }
40    };
41    // Slice up to the last valid continuation character
42    // Initialize to end of first char to handle single-char and multibyte identifiers correctly
43    let mut end_idx = first_char_len;
44    for (i, c) in iter {
45        if is_valid_continue(c) {
46            // Store the end byte index (start + char length) for proper UTF-8 boundary
47            end_idx = i + c.len_utf8();
48        } else {
49            break;
50        }
51    }
52    Some(&s[..end_idx])
53}
54
55pub fn is_valid_javascript_identifier(s: &str) -> bool {
56    // check stripping does not reduce the length of the token
57    strip_identifier(s).map_or(0, |t| t.len()) == s.len()
58}
59
60/// Finds the first valid identifier in the JS Source string given, provided
61/// the string begins with the identifier or whitespace.
62pub fn get_javascript_token(source_line: &str) -> Option<&str> {
63    match source_line.split_whitespace().next() {
64        Some(s) => strip_identifier(s),
65        None => None,
66    }
67}
68
#[cfg(test)]
mod tests {
    use super::*;

    /// Whole-string validation: the entire input must be one identifier.
    #[test]
    fn test_is_valid_javascript_identifier() {
        assert!(is_valid_javascript_identifier("foo_$123"));
        assert!(!is_valid_javascript_identifier(" foo"));
        assert!(!is_valid_javascript_identifier("foo "));
        assert!(!is_valid_javascript_identifier("[123]"));
        assert!(!is_valid_javascript_identifier("foo.bar"));

        // A valid identifier prefix is not enough: anything trailing it,
        // including whitespace-separated text, makes the whole string invalid.
        assert!(!is_valid_javascript_identifier("foo 123"));
        assert!(!is_valid_javascript_identifier("foo [bar]"));

        // Non-ASCII identifiers
        assert!(is_valid_javascript_identifier("한글변수"));
        assert!(is_valid_javascript_identifier("变量名"));
        assert!(is_valid_javascript_identifier("ひらがな"));
    }

    /// Token extraction: only the identifier prefix of the first
    /// whitespace-delimited chunk is returned.
    #[test]
    fn test_get_javascript_token() {
        assert_eq!(get_javascript_token("foo "), Some("foo"));
        assert_eq!(get_javascript_token("f _hi"), Some("f"));
        assert_eq!(get_javascript_token("foo.bar"), Some("foo"));
        assert_eq!(get_javascript_token("[foo,bar]"), None);

        // Non-ASCII tokens, with and without trailing non-identifier text
        assert_eq!(
            get_javascript_token("결제사_연결():De"),
            Some("결제사_연결")
        );
        assert_eq!(get_javascript_token("变量名123"), Some("变量名123"));
        assert_eq!(get_javascript_token("へんすう_test"), Some("へんすう_test"));
    }
}