pomsky 0.12.0

A new regular expression language
Documentation
use core::fmt;
use std::{
    cmp::Ordering,
    collections::BTreeSet,
    ops::{Add, AddAssign, RangeInclusive},
};

use pomsky_syntax::exprs::Category;

use crate::{
    exprs::char_class::RegexCharSetItem,
    regex::{RegexProperty, RegexShorthand},
};

#[derive(Debug, Default, Eq, Clone)]
pub(crate) struct UnicodeSet {
    ranges: BTreeSet<SetRange>,
    props: Vec<RegexCharSetItem>,
}

impl From<char> for UnicodeSet {
    fn from(value: char) -> Self {
        let mut set = UnicodeSet::new();
        set.ranges.insert(SetRange::single(value as u32));
        set
    }
}

impl PartialEq for UnicodeSet {
    fn eq(&self, other: &Self) -> bool {
        self.props == other.props
            && self.ranges.len() == other.ranges.len()
            && self.ranges.iter().all(|range| {
                other
                    .ranges
                    .get(range)
                    .is_some_and(|r| r.first == range.first && r.last == range.last)
            })
    }
}

#[derive(Eq, Clone, Copy)]
pub(crate) struct SetRange {
    pub(crate) first: u32,
    pub(crate) last: u32,
}

impl SetRange {
    pub(crate) fn single(char: u32) -> Self {
        SetRange { first: char, last: char }
    }

    pub(crate) fn overlaps_with(&self, other: &SetRange) -> bool {
        !(self.first > other.last || other.first > self.last)
    }

    pub(crate) fn as_chars(self) -> (char, char) {
        (self.first.try_into().unwrap(), self.last.try_into().unwrap())
    }
}

impl PartialEq for SetRange {
    fn eq(&self, other: &SetRange) -> bool {
        self.overlaps_with(other)
    }
}

impl PartialOrd for SetRange {
    fn partial_cmp(&self, other: &SetRange) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for SetRange {
    fn cmp(&self, other: &Self) -> Ordering {
        if self.first > other.last {
            Ordering::Greater
        } else if other.first > self.last {
            Ordering::Less
        } else {
            Ordering::Equal
        }
    }
}

impl Add for SetRange {
    type Output = SetRange;

    fn add(self, rhs: SetRange) -> Self::Output {
        SetRange { first: self.first.min(rhs.first), last: self.last.max(rhs.last) }
    }
}

impl AddAssign for SetRange {
    fn add_assign(&mut self, rhs: SetRange) {
        *self = *self + rhs;
    }
}

impl fmt::Debug for SetRange {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (a, b) = self.as_chars();
        write!(f, "{a:?}..{b:?}")
    }
}

impl UnicodeSet {
    pub fn new() -> Self {
        UnicodeSet { ranges: BTreeSet::new(), props: Vec::new() }
    }

    pub fn try_into_char(&self) -> Option<char> {
        if self.ranges.len() == 1
            && self.props.is_empty()
            && let Some(range) = self.ranges.first()
            && range.first == range.last
            && range.first != b'\r' as u32
        {
            Some(range.first.try_into().unwrap())
        } else {
            None
        }
    }

    pub fn len(&self) -> usize {
        self.ranges.len() + self.props.len()
    }

    pub fn add_char(&mut self, char: char) {
        self.add(SetRange::single(char as u32));
    }

    pub fn add_range(&mut self, range: RangeInclusive<char>) {
        self.add(SetRange { first: *range.start() as u32, last: *range.end() as u32 })
    }

    pub fn add_set_range(&mut self, range: SetRange) {
        self.add(range)
    }

    pub fn add_range_unchecked(&mut self, range: RangeInclusive<char>) {
        self.ranges.insert(SetRange { first: *range.start() as u32, last: *range.end() as u32 });
    }

    fn add(&mut self, mut range_new: SetRange) {
        let lower = SetRange::single(range_new.first.saturating_sub(1));
        let upper = SetRange::single(range_new.last.saturating_add(1));

        let overlapping = self.ranges.range(lower..=upper).copied().collect::<MaxTwoArray<_>>();
        for &r in overlapping.iter() {
            range_new += r;
            self.ranges.remove(&r);
        }
        self.ranges.insert(range_new);
    }

    pub fn add_prop(&mut self, prop: RegexCharSetItem) {
        if self.props.contains(&prop) {
            return;
        }
        self.props.push(prop);
    }

    pub fn full_props(&self) -> Option<(RegexCharSetItem, RegexCharSetItem)> {
        let mut prev_items = vec![];

        for &(mut item) in &self.props {
            use RegexCharSetItem as RCS;
            use RegexProperty as RP;
            use RegexShorthand as RS;

            if let RCS::Property { negative, value: RP::Category(Category::Separator) } = item {
                item = RCS::Shorthand(if negative { RS::NotSpace } else { RS::Space });
            }

            if let Some(negated) = item.negate()
                && prev_items.contains(&negated)
            {
                return Some((negated, item));
            }

            prev_items.push(item);
        }

        None
    }

    pub fn ranges(&self) -> impl '_ + Iterator<Item = SetRange> {
        self.ranges.iter().copied()
    }

    pub fn props(&self) -> impl '_ + Iterator<Item = RegexCharSetItem> {
        self.props.iter().copied()
    }

    pub(crate) fn extend(&mut self, other: UnicodeSet) {
        for prop in other.props {
            self.add_prop(prop);
        }
        for range in other.ranges {
            self.add(range);
        }
    }

    pub(crate) fn may_intersect(&self, other: &UnicodeSet) -> bool {
        !self.props.is_empty()
            || !other.props.is_empty()
            || other.ranges.iter().any(|range| self.ranges.contains(range))
    }
}

struct MaxTwoArray<T> {
    items: [Option<T>; 2],
}

impl<T> MaxTwoArray<T> {
    fn iter(&self) -> impl Iterator<Item = &T> {
        self.items.iter().filter_map(|it| it.as_ref())
    }
}

impl<A> FromIterator<A> for MaxTwoArray<A> {
    fn from_iter<T: IntoIterator<Item = A>>(iter: T) -> Self {
        let mut iter = iter.into_iter();
        let mut res = MaxTwoArray { items: [const { None }; 2] };

        if let Some(item) = iter.next() {
            res.items[0] = Some(item);
            if let Some(item) = iter.next() {
                res.items[1] = Some(item);
                if iter.next().is_some() {
                    panic!("Unexpected iterator having more than 2 elements");
                }
            }
        }

        res
    }
}