use regex::Regex;
use std::sync::LazyLock;
/// Lazily compiled regex matching a single control-like character.
///
/// The character class covers:
/// - U+0000–U+001F (C0 controls; note this includes tab, line feed and carriage return),
/// - U+007F–U+009F (DEL and the C1 controls),
/// - U+200B–U+200D (zero width space, zero width non-joiner, zero width joiner),
/// - U+FEFF (zero width no-break space / byte order mark).
///
/// The pattern is a fixed literal, so the `unwrap` can only fail if the literal itself is invalid.
pub static CONTROL_CHARS: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"[\x00-\x1F\x7F-\x9F\u200B-\u200D\uFEFF]").unwrap());
/// Lazily compiled regex matching a single "non-plain" whitespace character.
///
/// The character class covers tab, U+00A0 (no-break space), U+1680 (Ogham space mark),
/// U+2000–U+200A (en quad through hair space), U+2028 (line separator),
/// U+2029 (paragraph separator), U+202F (narrow no-break space),
/// U+205F (medium mathematical space) and U+3000 (ideographic space).
///
/// The ASCII space, line feed and carriage return are NOT part of the class.
/// NOTE(review): the name suggests matches are replaced with a plain space by callers —
/// confirm at the call sites.
pub static WHITESPACE_NORMALIZE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"[\t\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]").unwrap());
/// Lazily compiled regex matching a single zero-width character: U+200B (zero width space),
/// U+200C (zero width non-joiner), U+200D (zero width joiner), U+2060 (word joiner) or
/// U+FEFF (zero width no-break space / byte order mark).
pub static ZERO_WIDTH_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\u200B-\u200D\u2060\uFEFF]").unwrap());
/// Lazily compiled regex matching a single bidirectional-text control character.
///
/// The character class covers:
/// - U+202A–U+202E (directional embeddings, pop directional formatting, and overrides),
/// - U+2066–U+2069 (directional isolates and pop directional isolate),
/// - U+061C (Arabic letter mark),
/// - U+200E / U+200F (left-to-right mark / right-to-left mark).
pub static DANGEROUS_UNICODE_PATTERN: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"[\u202A-\u202E\u2066-\u2069\u061C\u200E\u200F]").unwrap());
/// Returns `true` when `c` falls inside one of the core emoji / symbol blocks.
///
/// Blocks checked (inclusive ranges):
/// - U+1F300–U+1F5FF Miscellaneous Symbols and Pictographs
/// - U+1F600–U+1F64F Emoticons
/// - U+1F680–U+1F6FF Transport and Map Symbols
/// - U+1F900–U+1F9FF Supplemental Symbols and Pictographs
/// - U+2600–U+26FF Miscellaneous Symbols
/// - U+2700–U+27BF Dingbats
#[inline]
pub fn is_emoji_or_symbol(c: char) -> bool {
    matches!(
        c,
        '\u{2600}'..='\u{26FF}'
            | '\u{2700}'..='\u{27BF}'
            | '\u{1F300}'..='\u{1F5FF}'
            | '\u{1F600}'..='\u{1F64F}'
            | '\u{1F680}'..='\u{1F6FF}'
            | '\u{1F900}'..='\u{1F9FF}'
    )
}
/// Returns `true` when `c` is accepted by [`is_emoji_or_symbol`] or belongs to the additional
/// code points listed below.
///
/// The additional set contains the regional-indicator range, the U+1FA00–U+1FAFF pictograph
/// ranges, individual symbols from the technical / geometric / arrow / CJK areas, the
/// variation selectors U+FE00–U+FE0F and the zero width joiner U+200D.
///
/// Entries between U+2600 and U+27BF are already covered by [`is_emoji_or_symbol`]; they are
/// kept in the list so the set reads as one complete table.
#[inline]
pub fn is_emoji_or_symbol_extended(c: char) -> bool {
    if is_emoji_or_symbol(c) {
        return true;
    }

    matches!(
        u32::from(c),
        // Supplementary-plane ranges.
        0x1F1E0..=0x1F1FF
            | 0x1FA00..=0x1FA6F
            | 0x1FA70..=0x1FAFF
            // Miscellaneous Technical.
            | 0x231A..=0x231B
            | 0x23E9..=0x23F3
            | 0x23F8..=0x23FA
            // Geometric Shapes.
            | 0x25AA..=0x25AB
            | 0x25B6
            | 0x25C0
            | 0x25FB..=0x25FE
            // Miscellaneous Symbols.
            | 0x2614..=0x2615
            | 0x2648..=0x2653
            | 0x267F
            | 0x2693
            | 0x26A1
            | 0x26AA..=0x26AB
            | 0x26BD..=0x26BE
            | 0x26C4..=0x26C5
            | 0x26CE
            | 0x26D4
            | 0x26EA
            | 0x26F2..=0x26F3
            | 0x26F5
            | 0x26FA
            | 0x26FD
            // Dingbats.
            | 0x2702
            | 0x2705
            | 0x2708..=0x270D
            | 0x270F
            | 0x2712
            | 0x2714
            | 0x2716
            | 0x271D
            | 0x2721
            | 0x2728
            | 0x2733..=0x2734
            | 0x2744
            | 0x2747
            | 0x274C
            | 0x274E
            | 0x2753..=0x2755
            | 0x2757
            | 0x2763..=0x2764
            | 0x2795..=0x2797
            | 0x27A1
            | 0x27B0
            | 0x27BF
            // Arrows and shapes outside the blocks above.
            | 0x2934..=0x2935
            | 0x2B05..=0x2B07
            | 0x2B1B..=0x2B1C
            | 0x2B50
            | 0x2B55
            // CJK symbols.
            | 0x3030
            | 0x303D
            | 0x3297
            | 0x3299
            // Variation selectors and the zero width joiner used in emoji sequences.
            | 0xFE00..=0xFE0F
            | 0x200D
    )
}
/// Selects which set of characters [`is_safe_unicode_letter`] accepts.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum UnicodeLetterMode {
    /// ASCII letters plus a fixed set of Latin ranges: the Latin-1 letters
    /// (U+00C0–U+00FF without U+00D7 and U+00F7), Latin Extended-A (U+0100–U+017F)
    /// and Latin Extended Additional (U+1E00–U+1EFF).
    Conservative,
    /// ASCII letters plus any character for which `char::is_alphabetic` is true, except
    /// private-use, variation-selector, zero-width and bidi embedding/override code points.
    Permissive,
    /// Only `a`–`z` and `A`–`Z`.
    AsciiOnly,
    /// A code-point allow-list of script blocks (Latin, Greek, Cyrillic, Hebrew, Arabic,
    /// Syriac, Hangul, Kana, CJK ideographs, Yi), minus private-use areas and variation
    /// selectors.
    GitHub,
}
/// Decides whether `c` is accepted as a "safe" letter under the given `mode`.
///
/// * `AsciiOnly` — only ASCII letters.
/// * `Conservative` — ASCII letters, the Latin-1 letters (U+00C0–U+00FF without U+00D7 and
///   U+00F7), Latin Extended-A and Latin Extended Additional.
/// * `Permissive` — ASCII letters, plus every `char::is_alphabetic` character that is not in
///   the private-use area, the variation selectors, U+200B–U+200D or U+202A–U+202E.
/// * `GitHub` — a pure code-point allow-list of script blocks, minus the private-use areas
///   and variation selectors.
///
/// NOTE(review): the `GitHub` arm checks block membership only, never the alphabetic
/// property, so every code point in the listed blocks is accepted — including ASCII digits,
/// punctuation and control characters. Presumably callers pre-filter with `is_alphabetic`;
/// confirm at the call sites.
#[inline]
pub fn is_safe_unicode_letter(c: char, mode: UnicodeLetterMode) -> bool {
    let cp = u32::from(c);

    match mode {
        UnicodeLetterMode::AsciiOnly => c.is_ascii_alphabetic(),

        UnicodeLetterMode::Conservative => {
            c.is_ascii_alphabetic()
                || matches!(
                    cp,
                    // Latin-1 letters, skipping U+00D7 (×) and U+00F7 (÷).
                    0x00C0..=0x00D6
                        | 0x00D8..=0x00F6
                        | 0x00F8..=0x00FF
                        // Latin Extended-A.
                        | 0x0100..=0x017F
                        // Latin Extended Additional.
                        | 0x1E00..=0x1EFF
                )
        }

        UnicodeLetterMode::Permissive => {
            let is_excluded = matches!(
                cp,
                0xE000..=0xF8FF // Private Use Area
                    | 0xFE00..=0xFE0F // Variation Selectors
                    | 0x200B..=0x200D // zero width space / non-joiner / joiner
                    | 0x202A..=0x202E // bidi embeddings and overrides
            );
            c.is_ascii_alphabetic() || (c.is_alphabetic() && !is_excluded)
        }

        UnicodeLetterMode::GitHub => {
            let is_excluded = matches!(
                cp,
                0xE000..=0xF8FF // Private Use Area
                    | 0xF0000..=0xFFFFD // Supplementary Private Use Area-A
                    | 0x100000..=0x10FFFD // Supplementary Private Use Area-B
                    | 0xFE00..=0xFE0F // Variation Selectors
                    | 0xE0100..=0xE01EF // Variation Selectors Supplement
            );
            let is_allowed = matches!(
                cp,
                0x0000..=0x007F // Basic Latin
                    | 0x0080..=0x00FF // Latin-1 Supplement
                    | 0x0100..=0x017F // Latin Extended-A
                    | 0x0180..=0x024F // Latin Extended-B
                    | 0x0370..=0x03FF // Greek and Coptic
                    | 0x0400..=0x04FF // Cyrillic
                    | 0x0500..=0x052F // Cyrillic Supplement
                    | 0x0590..=0x05FF // Hebrew
                    | 0x0600..=0x06FF // Arabic
                    | 0x0700..=0x074F // Syriac
                    | 0x0750..=0x077F // Arabic Supplement
                    | 0x1100..=0x11FF // Hangul Jamo
                    | 0x3040..=0x309F // Hiragana
                    | 0x30A0..=0x30FF // Katakana
                    | 0x3130..=0x318F // Hangul Compatibility Jamo
                    | 0x4E00..=0x9FFF // CJK Unified Ideographs
                    | 0xAC00..=0xD7AF // Hangul Syllables
                    | 0xA000..=0xA48F // Yi Syllables
                    | 0xA490..=0xA4CF // Yi Radicals
            );
            !is_excluded && is_allowed
        }
    }
}
/// Smaller input-size limit: 10 × 1024 = 10 240.
///
/// NOTE(review): presumably a byte count used to cap input before processing — confirm at
/// the call sites.
pub const MAX_INPUT_LENGTH: usize = 10 * 1024;
/// Larger input-size limit: 2^20 = 1 048 576.
pub const MAX_INPUT_SIZE_LARGE: usize = 1 << 20;
/// Returns the longest prefix of `input` that is at most `max_len` bytes long and ends on a
/// UTF-8 character boundary.
///
/// If `input` already fits within `max_len` bytes it is returned unchanged. Otherwise the cut
/// point is moved *back* from `max_len` to the nearest character boundary, so the result never
/// exceeds `max_len` bytes and never splits a multi-byte character. The result may be empty
/// when the first character alone is longer than `max_len` bytes.
///
/// # Examples
///
/// ```ignore
/// assert_eq!(truncate_at_char_boundary("Hello, 世界!", 5), "Hello");
/// // Byte 8 is inside '世' (bytes 7..10), so the cut falls back to byte 7.
/// assert_eq!(truncate_at_char_boundary("Hello, 世界!", 8), "Hello, ");
/// ```
#[inline]
pub fn truncate_at_char_boundary(input: &str, max_len: usize) -> &str {
    if input.len() <= max_len {
        return input;
    }

    // Here `max_len < input.len()`, so it is a valid offset to probe. A UTF-8 character is at
    // most 4 bytes, so this steps back at most 3 times; offset 0 is always a boundary, which
    // guarantees termination without underflow.
    let mut end = max_len;
    while !input.is_char_boundary(end) {
        end -= 1;
    }
    &input[..end]
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Core emoji are recognised; ordinary ASCII characters are not.
    #[test]
    fn test_is_emoji_or_symbol() {
        for symbol in ['😀', '🎉', '❤'] {
            assert!(is_emoji_or_symbol(symbol), "{symbol:?} should be accepted");
        }
        for plain in ['a', '1', ' '] {
            assert!(!is_emoji_or_symbol(plain), "{plain:?} should be rejected");
        }
    }

    /// The extended check accepts both core emoji and the additional code points.
    #[test]
    fn test_is_emoji_or_symbol_extended() {
        for symbol in ['😀', '✅', '⭐'] {
            assert!(is_emoji_or_symbol_extended(symbol), "{symbol:?} should be accepted");
        }
        assert!(!is_emoji_or_symbol_extended('a'));
    }

    /// Checks one representative character per script against every mode.
    #[test]
    fn test_is_safe_unicode_letter_modes() {
        // Column order of the expectation rows below.
        let modes = [
            UnicodeLetterMode::AsciiOnly,
            UnicodeLetterMode::Conservative,
            UnicodeLetterMode::Permissive,
            UnicodeLetterMode::GitHub,
        ];
        let cases = [
            ('a', [true, true, true, true]),
            ('é', [false, true, true, true]),
            ('日', [false, false, true, true]),
            ('α', [false, false, true, true]),
        ];

        for (ch, expected) in cases {
            for (mode, want) in modes.iter().copied().zip(expected.iter().copied()) {
                assert_eq!(
                    is_safe_unicode_letter(ch, mode),
                    want,
                    "unexpected result for {ch:?} in {mode:?}"
                );
            }
        }
    }

    /// Truncation leaves short input alone and always ends on a character boundary.
    #[test]
    fn test_truncate_at_char_boundary() {
        let input = "Hello, 世界!";

        let untouched = truncate_at_char_boundary(input, 100);
        assert_eq!(untouched, input);

        let ascii_prefix = truncate_at_char_boundary(input, 5);
        assert_eq!(ascii_prefix, "Hello");

        // Byte 8 lies inside a multi-byte character.
        let truncated = truncate_at_char_boundary(input, 8);
        assert!(truncated.is_char_boundary(truncated.len()));
    }
}