use std::sync::atomic::{AtomicU64, Ordering};
use unicode_ident::{is_xid_continue, is_xid_start};
/// Process-wide count of calls to `is_perl_identifier_start`, including the
/// calls made indirectly through `is_perl_identifier_continue`.
/// Cleared by `reset_unicode_stats`.
static UNICODE_CHAR_CHECKS: AtomicU64 = AtomicU64::new(0);
/// Process-wide count of `is_perl_identifier_start` calls that accepted a
/// character only because it fell inside one of the emoji code point ranges
/// (i.e. it was neither `_` nor an XID_Start character).
/// Cleared by `reset_unicode_stats`.
static UNICODE_EMOJI_HITS: AtomicU64 = AtomicU64::new(0);
/// Returns the current Unicode classification counters as
/// `(character_checks, emoji_hits)`.
///
/// Each counter is read with its own relaxed load, so the pair is not a
/// single atomic snapshot: a concurrent classifier call may be reflected in
/// one value but not the other.
#[allow(dead_code)]
pub fn get_unicode_stats() -> (u64, u64) {
    let char_checks = UNICODE_CHAR_CHECKS.load(Ordering::Relaxed);
    let emoji_hits = UNICODE_EMOJI_HITS.load(Ordering::Relaxed);
    (char_checks, emoji_hits)
}
/// Sets both Unicode classification counters back to zero.
///
/// The counters are cleared one after the other with relaxed stores, so a
/// classifier call racing with the reset may leave a non-zero value behind.
#[allow(dead_code)]
pub fn reset_unicode_stats() {
    for counter in [&UNICODE_CHAR_CHECKS, &UNICODE_EMOJI_HITS] {
        counter.store(0, Ordering::Relaxed);
    }
}
/// Reports whether the scalar value `ch_u32` lies in one of the code point
/// ranges this module treats as emoji / pictographic symbols.
///
/// The accepted set is three contiguous spans:
/// * `U+2600..=U+27BF` (Miscellaneous Symbols through Dingbats),
/// * `U+1F000..=U+1F02F` (Mahjong Tiles),
/// * `U+1F0A0..=U+1FAFF` (Playing Cards through Symbols and Pictographs
///   Extended-A).
///
/// `U+1F030..=U+1F09F` sits between the last two spans and is deliberately
/// not part of the set.
fn is_emoji_codepoint(ch_u32: u32) -> bool {
    match ch_u32 {
        // BMP symbol blocks.
        0x2600..=0x27BF => true,
        // Supplementary-plane pictographs, minus the U+1F030..=U+1F09F gap.
        0x1F000..=0x1F02F | 0x1F0A0..=0x1FAFF => true,
        _ => false,
    }
}
/// Returns `true` if `ch` may begin an identifier.
///
/// A character qualifies when it is `_`, has the Unicode `XID_Start`
/// property, or falls in one of the emoji ranges recognised by
/// `is_emoji_codepoint`.
///
/// Side effects: every call bumps `UNICODE_CHAR_CHECKS`; a call that accepts
/// the character solely through the emoji ranges also bumps
/// `UNICODE_EMOJI_HITS`.
pub fn is_perl_identifier_start(ch: char) -> bool {
    UNICODE_CHAR_CHECKS.fetch_add(1, Ordering::Relaxed);

    let plain_start = ch == '_' || is_xid_start(ch);
    if plain_start {
        // Underscore / XID_Start never counts as an emoji hit.
        true
    } else if is_emoji_codepoint(u32::from(ch)) {
        UNICODE_EMOJI_HITS.fetch_add(1, Ordering::Relaxed);
        true
    } else {
        false
    }
}
/// Returns `true` if `ch` may appear after the first character of an
/// identifier.
///
/// Everything accepted by `is_perl_identifier_start` is accepted here, plus
/// `XID_Continue` characters, the apostrophe, the zero-width (non-)joiners,
/// the variation selectors `U+FE00..=U+FE0F`, and the skin-tone modifiers
/// `U+1F3FB..=U+1F3FF`.
///
/// The start check always runs first, so each call also updates the
/// statistics counters exactly as a call to `is_perl_identifier_start` would.
pub fn is_perl_identifier_continue(ch: char) -> bool {
    if is_perl_identifier_start(ch) {
        return true;
    }

    match ch {
        // NOTE(review): presumably Perl's legacy `'` package separator — confirm.
        '\'' => true,
        // ZWNJ / ZWJ, used to glue emoji sequences together.
        '\u{200C}' | '\u{200D}' => true,
        // Variation selectors.
        '\u{FE00}'..='\u{FE0F}' => true,
        // Skin-tone modifiers. These already fall inside the emoji ranges
        // accepted by the start check above; listed for explicitness.
        '\u{1F3FB}'..='\u{1F3FF}' => true,
        _ => is_xid_continue(ch),
    }
}
/// Scans `text` once and returns `(char_count, emoji_count, complex_char_count)`.
///
/// * `char_count` — number of Unicode scalar values in `text`.
/// * `emoji_count` — how many of them fall in the ranges recognised by
///   `is_emoji_codepoint`.
/// * `complex_char_count` — how many need three or more UTF-8 bytes, i.e.
///   every scalar value at or above `U+0800`, which includes all code points
///   outside the Basic Multilingual Plane.
///
/// This function does not touch the global statistics counters.
#[allow(dead_code)]
pub fn analyze_unicode_complexity(text: &str) -> (usize, usize, usize) {
    text.chars()
        .fold((0, 0, 0), |(chars, emoji, complex), ch| {
            let is_emoji = is_emoji_codepoint(u32::from(ch));
            // A 3- or 4-byte encoding covers both "above U+FFFF" and
            // "more than two UTF-8 bytes".
            let is_complex = ch.len_utf8() >= 3;
            (
                chars + 1,
                emoji + usize::from(is_emoji),
                complex + usize::from(is_complex),
            )
        })
}
#[cfg(test)]
mod tests {
    use super::{
        analyze_unicode_complexity, get_unicode_stats, is_perl_identifier_continue,
        is_perl_identifier_start, reset_unicode_stats,
    };
    use std::sync::{Mutex, MutexGuard};

    /// Serializes every test in this module that touches the process-wide
    /// statistics counters.
    ///
    /// The test harness runs tests on multiple threads by default, and both
    /// classifier functions increment global atomics. Without this lock the
    /// exact-count assertions below race with the other tests and fail
    /// intermittently.
    ///
    /// NOTE(review): tests in other modules of the same test binary that call
    /// the classifiers can still perturb the counters; they would need to
    /// share this lock or avoid running concurrently.
    static STATS_LOCK: Mutex<()> = Mutex::new(());

    /// Acquires `STATS_LOCK`, ignoring poisoning so that one failing test does
    /// not cascade into spurious failures of the others.
    fn lock_stats() -> MutexGuard<'static, ()> {
        STATS_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    #[test]
    fn identifier_start_accepts_ascii_xid_and_emoji() {
        let _guard = lock_stats();
        reset_unicode_stats();

        assert!(is_perl_identifier_start('_'));
        assert!(is_perl_identifier_start('A'));
        assert!(is_perl_identifier_start('λ'));
        assert!(is_perl_identifier_start('🚀'));
        assert!(!is_perl_identifier_start('1'));

        // Five calls above; only the rocket was accepted via the emoji ranges.
        let (checks, emoji_hits) = get_unicode_stats();
        assert_eq!(checks, 5);
        assert_eq!(emoji_hits, 1);
    }

    #[test]
    fn identifier_start_rejects_punctuation() {
        // Held because the call increments the shared check counter.
        let _guard = lock_stats();
        assert!(!is_perl_identifier_start('-'));
    }

    #[test]
    fn identifier_continue_accepts_joiners_selectors_and_modifiers() {
        // Held because every call increments the shared counters.
        let _guard = lock_stats();
        assert!(is_perl_identifier_continue('\''));
        assert!(is_perl_identifier_continue('\u{200C}'));
        assert!(is_perl_identifier_continue('\u{200D}'));
        assert!(is_perl_identifier_continue('\u{FE0F}'));
        assert!(is_perl_identifier_continue('\u{1F3FB}'));
        assert!(is_perl_identifier_continue('\u{1F3FD}'));
        assert!(!is_perl_identifier_continue(' '));
        assert!(!is_perl_identifier_continue('-'));
    }

    #[test]
    fn analyze_complexity_counts_chars_emoji_and_non_bmp() {
        // No lock needed: the analysis helper never touches the counters.
        let (char_count, emoji_count, complex_char_count) =
            analyze_unicode_complexity("aλ🚀\u{FE0F}");
        assert_eq!(char_count, 4);
        assert_eq!(emoji_count, 1);
        // The rocket (4 UTF-8 bytes) and U+FE0F (3 UTF-8 bytes).
        assert_eq!(complex_char_count, 2);
    }
}