use std::collections::HashMap;
use unicode_general_category::{get_general_category, GeneralCategory};
/// A set of Unicode scalar values stored as inclusive `(start, end)` ranges.
///
/// Every constructor and mutator leaves `ranges` sorted by start with overlapping or
/// adjacent ranges merged, so two sets holding the same characters compare equal and
/// hash identically through the derived impls.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct RangeSet {
    ranges: Vec<(char, char)>,
}

impl Default for RangeSet {
    /// Same as [`RangeSet::new`]: the empty set.
    fn default() -> Self {
        Self::new()
    }
}

impl RangeSet {
    /// Creates an empty set.
    pub fn new() -> Self {
        RangeSet { ranges: Vec::new() }
    }

    /// Creates a set containing exactly `ch`.
    pub fn from_char(ch: char) -> Self {
        RangeSet {
            ranges: vec![(ch, ch)],
        }
    }

    /// Creates a set containing `start..=end`; a reversed range yields the empty set.
    pub fn from_range(start: char, end: char) -> Self {
        if start <= end {
            RangeSet {
                ranges: vec![(start, end)],
            }
        } else {
            RangeSet::new()
        }
    }

    /// Returns `true` when the set holds no characters.
    pub fn is_empty(&self) -> bool {
        self.ranges.is_empty()
    }

    /// Inserts a single character.
    pub fn add_char(&mut self, ch: char) {
        self.add_range(ch, ch);
    }

    /// Inserts `start..=end`, merging with existing ranges. Reversed ranges are ignored.
    pub fn add_range(&mut self, start: char, end: char) {
        if start > end {
            return;
        }
        self.ranges.push((start, end));
        self.normalize();
    }

    /// The scalar value immediately after `c`, stepping over the surrogate gap
    /// (U+D800..=U+DFFF are not valid `char`s). `None` for `char::MAX`.
    fn next_scalar(c: char) -> Option<char> {
        match c {
            '\u{D7FF}' => Some('\u{E000}'),
            char::MAX => None,
            _ => char::from_u32(c as u32 + 1),
        }
    }

    /// The scalar value immediately before `c`, stepping over the surrogate gap.
    /// `None` for U+0000.
    fn prev_scalar(c: char) -> Option<char> {
        match c {
            '\u{E000}' => Some('\u{D7FF}'),
            '\0' => None,
            _ => char::from_u32(c as u32 - 1),
        }
    }

    /// Restores the canonical form: sorted by start, with overlapping or adjacent
    /// ranges merged. U+D7FF and U+E000 count as adjacent because no `char` lies
    /// between them.
    fn normalize(&mut self) {
        if self.ranges.len() <= 1 {
            return;
        }
        self.ranges.sort_unstable();
        let mut merged: Vec<(char, char)> = Vec::with_capacity(self.ranges.len());
        for &(start, end) in &self.ranges {
            match merged.last_mut() {
                Some(last) if start <= last.1 || Self::next_scalar(last.1) == Some(start) => {
                    last.1 = last.1.max(end);
                }
                _ => merged.push((start, end)),
            }
        }
        self.ranges = merged;
    }

    /// Returns the set of characters present in `self`, `other`, or both.
    pub fn union(&self, other: &RangeSet) -> RangeSet {
        let mut result = self.clone();
        // Append everything first and normalize once instead of re-sorting per range.
        result.ranges.extend_from_slice(&other.ranges);
        result.normalize();
        result
    }

    /// Returns the set of characters present in both `self` and `other`.
    pub fn intersection(&self, other: &RangeSet) -> RangeSet {
        let mut result = RangeSet::new();
        // Both range lists are sorted and disjoint, so a single merge-style pass suffices.
        let (mut i, mut j) = (0, 0);
        while i < self.ranges.len() && j < other.ranges.len() {
            let (a_start, a_end) = self.ranges[i];
            let (b_start, b_end) = other.ranges[j];
            let lo = a_start.max(b_start);
            let hi = a_end.min(b_end);
            if lo <= hi {
                result.ranges.push((lo, hi));
            }
            // Advance whichever range finishes first; the other may still overlap more.
            if a_end < b_end {
                i += 1;
            } else {
                j += 1;
            }
        }
        result.normalize();
        result
    }

    /// Returns the set of characters present in `self` but not in `other`.
    pub fn minus(&self, other: &RangeSet) -> RangeSet {
        let mut remaining = self.ranges.clone();
        for &(sub_start, sub_end) in &other.ranges {
            let mut kept = Vec::with_capacity(remaining.len() + 1);
            for &(start, end) in &remaining {
                if sub_end < start || sub_start > end {
                    // No overlap: the range survives untouched.
                    kept.push((start, end));
                    continue;
                }
                // Keep the part to the left of the subtracted range. The neighbour is
                // computed over scalar values, so a cut at U+E000 ends at U+D7FF.
                if start < sub_start {
                    if let Some(before) = Self::prev_scalar(sub_start) {
                        kept.push((start, before));
                    }
                }
                // Keep the part to the right of the subtracted range (a cut at U+D7FF
                // resumes at U+E000).
                if end > sub_end {
                    if let Some(after) = Self::next_scalar(sub_end) {
                        kept.push((after, end));
                    }
                }
            }
            remaining = kept;
        }
        let mut result = RangeSet { ranges: remaining };
        result.normalize();
        result
    }

    /// Returns `true` when `ch` is a member of the set.
    pub fn contains(&self, ch: char) -> bool {
        self.ranges
            .iter()
            .any(|&(start, end)| start <= ch && ch <= end)
    }

    /// Number of disjoint ranges in the canonical representation.
    pub fn num_ranges(&self) -> usize {
        self.ranges.len()
    }

    /// Builds an identifier of the form `cc_<HEX>[_<HEX>]...`: each range contributes
    /// its start code point in uppercase hex, followed by its end when they differ.
    pub fn to_name(&self) -> String {
        let parts: Vec<String> = self
            .ranges
            .iter()
            .map(|&(start, end)| {
                if start == end {
                    format!("{:X}", start as u32)
                } else {
                    format!("{:X}_{:X}", start as u32, end as u32)
                }
            })
            .collect();
        format!("cc_{}", parts.join("_"))
    }

    /// Returns a predicate that is `true` only for strings consisting of exactly one
    /// character that belongs to this set (as it is at the time of the call).
    pub fn to_predicate(&self) -> Box<dyn Fn(&str) -> bool + Send + Sync> {
        let set = self.clone();
        Box::new(move |s: &str| {
            // Look at no more than two chars rather than counting the whole string.
            let mut chars = s.chars();
            match (chars.next(), chars.next()) {
                (Some(ch), None) => set.contains(ch),
                _ => false,
            }
        })
    }
}
/// Splits character-class text into its members.
///
/// Members are separated by `;`, `,` or `|`. Separators inside a single- or
/// double-quoted section are kept as part of the member, and an unterminated quote
/// extends to the end of the input. Each member is trimmed; empty members are dropped.
fn split_charclass_content(content: &str) -> Vec<String> {
    /// Moves the trimmed buffer into `out` (if non-empty) and resets the buffer.
    fn flush(buf: &mut String, out: &mut Vec<String>) {
        let piece = buf.trim();
        if !piece.is_empty() {
            out.push(piece.to_string());
        }
        buf.clear();
    }

    let mut pieces = Vec::new();
    let mut buf = String::new();
    // `Some(q)` while inside a quoted section opened by `q`.
    let mut open_quote: Option<char> = None;

    for c in content.chars() {
        match open_quote {
            Some(q) => {
                buf.push(c);
                if c == q {
                    open_quote = None;
                }
            }
            None => match c {
                '"' | '\'' => {
                    open_quote = Some(c);
                    buf.push(c);
                }
                ';' | ',' | '|' => flush(&mut buf, &mut pieces),
                _ => buf.push(c),
            },
        }
    }
    flush(&mut buf, &mut pieces);
    pieces
}
pub fn unicode_category_to_rangeset(category_name: &str) -> Option<RangeSet> {
use std::sync::{Mutex, OnceLock};
static UNICODE_CACHE: OnceLock<Mutex<HashMap<String, RangeSet>>> = OnceLock::new();
let cache = UNICODE_CACHE.get_or_init(|| Mutex::new(HashMap::new()));
{
let cache_lock = cache.lock().unwrap();
if let Some(rangeset) = cache_lock.get(category_name) {
return Some(rangeset.clone());
}
}
let mut result = RangeSet::new();
let is_major = matches!(category_name, "L" | "M" | "N" | "P" | "S" | "Z" | "C");
let is_minor = matches!(
category_name,
"Lu" | "Ll"
| "Lt"
| "Lm"
| "Lo"
| "LC"
| "Mn"
| "Mc"
| "Me"
| "Nd"
| "Nl"
| "No"
| "Pc"
| "Pd"
| "Ps"
| "Pe"
| "Pi"
| "Pf"
| "Po"
| "Sm"
| "Sc"
| "Sk"
| "So"
| "Zs"
| "Zl"
| "Zp"
| "Cc"
| "Cf"
| "Cs"
| "Co"
| "Cn"
);
if !is_major && !is_minor {
return None;
}
let matches_category = |cat: GeneralCategory, name: &str| -> bool {
match name {
"L" => matches!(
cat,
GeneralCategory::UppercaseLetter
| GeneralCategory::LowercaseLetter
| GeneralCategory::TitlecaseLetter
| GeneralCategory::ModifierLetter
| GeneralCategory::OtherLetter
),
"LC" => matches!(
cat,
GeneralCategory::UppercaseLetter
| GeneralCategory::LowercaseLetter
| GeneralCategory::TitlecaseLetter
),
"M" => matches!(
cat,
GeneralCategory::NonspacingMark
| GeneralCategory::SpacingMark
| GeneralCategory::EnclosingMark
),
"N" => matches!(
cat,
GeneralCategory::DecimalNumber
| GeneralCategory::LetterNumber
| GeneralCategory::OtherNumber
),
"P" => matches!(
cat,
GeneralCategory::ConnectorPunctuation
| GeneralCategory::DashPunctuation
| GeneralCategory::OpenPunctuation
| GeneralCategory::ClosePunctuation
| GeneralCategory::InitialPunctuation
| GeneralCategory::FinalPunctuation
| GeneralCategory::OtherPunctuation
),
"S" => matches!(
cat,
GeneralCategory::MathSymbol
| GeneralCategory::CurrencySymbol
| GeneralCategory::ModifierSymbol
| GeneralCategory::OtherSymbol
),
"Z" => matches!(
cat,
GeneralCategory::SpaceSeparator
| GeneralCategory::LineSeparator
| GeneralCategory::ParagraphSeparator
),
"C" => matches!(
cat,
GeneralCategory::Control
| GeneralCategory::Format
| GeneralCategory::Surrogate
| GeneralCategory::PrivateUse
| GeneralCategory::Unassigned
),
"Lu" => cat == GeneralCategory::UppercaseLetter,
"Ll" => cat == GeneralCategory::LowercaseLetter,
"Lt" => cat == GeneralCategory::TitlecaseLetter,
"Lm" => cat == GeneralCategory::ModifierLetter,
"Lo" => cat == GeneralCategory::OtherLetter,
"Mn" => cat == GeneralCategory::NonspacingMark,
"Mc" => cat == GeneralCategory::SpacingMark,
"Me" => cat == GeneralCategory::EnclosingMark,
"Nd" => cat == GeneralCategory::DecimalNumber,
"Nl" => cat == GeneralCategory::LetterNumber,
"No" => cat == GeneralCategory::OtherNumber,
"Pc" => cat == GeneralCategory::ConnectorPunctuation,
"Pd" => cat == GeneralCategory::DashPunctuation,
"Ps" => cat == GeneralCategory::OpenPunctuation,
"Pe" => cat == GeneralCategory::ClosePunctuation,
"Pi" => cat == GeneralCategory::InitialPunctuation,
"Pf" => cat == GeneralCategory::FinalPunctuation,
"Po" => cat == GeneralCategory::OtherPunctuation,
"Sm" => cat == GeneralCategory::MathSymbol,
"Sc" => cat == GeneralCategory::CurrencySymbol,
"Sk" => cat == GeneralCategory::ModifierSymbol,
"So" => cat == GeneralCategory::OtherSymbol,
"Zs" => cat == GeneralCategory::SpaceSeparator,
"Zl" => cat == GeneralCategory::LineSeparator,
"Zp" => cat == GeneralCategory::ParagraphSeparator,
"Cc" => cat == GeneralCategory::Control,
"Cf" => cat == GeneralCategory::Format,
"Cs" => cat == GeneralCategory::Surrogate,
"Co" => cat == GeneralCategory::PrivateUse,
"Cn" => cat == GeneralCategory::Unassigned,
_ => false,
}
};
let mut range_start: Option<char> = None;
let mut prev_char: Option<char> = None;
for codepoint in 0u32..=0x10FFFF {
if let Some(ch) = char::from_u32(codepoint) {
let cat = get_general_category(ch);
let mut is_match = matches_category(cat, category_name);
if is_match && matches!(category_name, "Cc" | "C") && (ch == '\n' || ch == '\r') {
is_match = false;
}
if is_match {
if range_start.is_none() {
range_start = Some(ch);
}
prev_char = Some(ch);
} else {
if let (Some(start), Some(end)) = (range_start, prev_char) {
result.add_range(start, end);
}
range_start = None;
prev_char = None;
}
}
}
if let (Some(start), Some(end)) = (range_start, prev_char) {
result.add_range(start, end);
}
{
let mut cache_lock = cache.lock().unwrap();
cache_lock.insert(category_name.to_string(), result.clone());
}
Some(result)
}
/// Builds a [`RangeSet`] from the textual content of a character class.
///
/// `content` is split into members by `split_charclass_content`. Each member may be:
///
/// - a range `FROM-TO`, where each endpoint is either `#HEX` or a quoted character
///   (only the first character of a quoted endpoint is used); whitespace around the
///   `-` is tolerated;
/// - a single `#HEX` code point;
/// - a quoted string, every character of which is added;
/// - a Unicode general category name accepted by `unicode_category_to_rangeset`.
///
/// Members that fit none of these forms contribute nothing.
///
/// NOTE(review): the accepted syntax resembles Invisible XML character sets — confirm
/// before relying on that specification for edge cases such as doubled quotes.
pub fn charclass_to_rangeset(content: &str) -> RangeSet {
    let mut result = RangeSet::new();
    for element in split_charclass_content(content) {
        let element = element.trim();
        if element.is_empty() {
            continue;
        }
        if let Some((start, end)) = parse_charclass_range(element) {
            // `add_range` ignores reversed ranges.
            result.add_range(start, end);
        } else if element.starts_with('#') {
            if let Some(ch) = parse_hex_char(element) {
                result.add_char(ch);
            }
        } else if let Some(quote) = leading_quote(element) {
            // A quoted literal: add each character between the outer quotes. Text that
            // opens with a quote but is neither closed nor range-like is ignored.
            if element.contains('-') || element.ends_with(quote) {
                for ch in element.trim_matches(quote).chars() {
                    result.add_char(ch);
                }
            }
        } else if let Some(category_rangeset) = unicode_category_to_rangeset(element) {
            result = result.union(&category_rangeset);
        }
    }
    result
}

/// Returns the quote character (`'` or `"`) that `s` starts with, if any.
fn leading_quote(s: &str) -> Option<char> {
    s.chars().next().filter(|&c| c == '\'' || c == '"')
}

/// Parses `FROM-TO` where each side is `#HEX` or a quoted character.
/// Returns `None` when `element` is not a well-formed range.
fn parse_charclass_range(element: &str) -> Option<(char, char)> {
    let (from, to) = if element.starts_with('#') {
        // Hex digits cannot contain '-', so the first dash separates the endpoints.
        let dash = element.find('-')?;
        (&element[..dash], &element[dash + 1..])
    } else {
        // The start is quoted, so the separator is the dash after the closing quote
        // (the quoted character itself may be '-').
        let quote = leading_quote(element)?;
        let close = element[1..].find(quote)? + 1;
        let rest = element[close + 1..].trim_start().strip_prefix('-')?;
        (&element[..=close], rest)
    };
    Some((
        parse_range_endpoint(from.trim())?,
        parse_range_endpoint(to.trim())?,
    ))
}

/// Parses one range endpoint: `#HEX`, or the first character of a quoted token.
fn parse_range_endpoint(token: &str) -> Option<char> {
    if token.starts_with('#') {
        return parse_hex_char(token);
    }
    let quote = leading_quote(token)?;
    let body = &token[1..];
    let close = body.find(quote)?;
    body[..close].chars().next()
}
/// Parses a `#HEX` code point literal such as `#41` or `#1F600`.
///
/// Returns `None` when the `#` prefix is missing, when the remainder is empty or
/// contains anything other than hexadecimal digits, or when the value is not a valid
/// Unicode scalar value (a surrogate, or greater than U+10FFFF).
fn parse_hex_char(s: &str) -> Option<char> {
    let digits = s.strip_prefix('#')?;
    // `from_str_radix` would accept a leading '+', which is not part of a hex literal.
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }
    u32::from_str_radix(digits, 16)
        .ok()
        .and_then(char::from_u32)
}