use std::{borrow::Cow, cmp::Ordering};
use crate::{
alternation::RegexAlternation,
char_class::{RegexCharClass, RegexClassItem},
compile::CompileResult,
error::{CompileErrorKind, ParseError, ParseErrorKind},
features::RulexFeatures,
group::{RegexCapture, RegexGroup},
options::ParseOptions,
regex::Regex,
repetition::{RegexQuantifier, RegexRepetition, RepetitionKind},
span::Span,
};
#[derive(Clone, PartialEq, Eq)]
pub(crate) struct Range {
start: Vec<u8>,
end: Vec<u8>,
radix: u8,
pub(crate) span: Span,
}
impl Range {
pub(crate) fn new(start: Vec<u8>, end: Vec<u8>, radix: u8, span: Span) -> Self {
Range { start, end, radix, span }
}
pub(crate) fn compile(&self) -> CompileResult<'static> {
match range(&self.start, &self.end, true, self.radix) {
Ok(rule) => Ok(rule.to_regex()),
Err(Error) => {
Err(CompileErrorKind::Other("Expanding the range yielded an unexpected error")
.at(self.span))
}
}
}
pub(crate) fn validate(&self, options: &ParseOptions) -> Result<(), ParseError> {
if self.end.len() > options.max_range_size as usize {
return Err(ParseErrorKind::RangeIsTooBig(options.max_range_size).at(self.span));
}
options.allowed_features.require(RulexFeatures::RANGES, self.span)
}
}
#[cfg(feature = "dbg")]
impl std::fmt::Debug for Range {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
fn hex(n: u8) -> char {
match n {
0..=9 => (n + b'0') as char,
_ => (n + (b'A' - 10)) as char,
}
}
write!(
f,
"Range (base {}): {}-{}",
self.radix,
self.start.iter().map(|&n| hex(n)).collect::<String>(),
self.end.iter().map(|&n| hex(n)).collect::<String>(),
)
}
}
fn range(a: &[u8], b: &[u8], is_first: bool, radix: u8) -> Result<Rule, Error> {
let hi_digit = radix - 1;
let lo_digit = if is_first { 1 } else { 0 };
debug_assert!(a.len() <= b.len() && (a.len() < b.len() || a <= b));
Ok(match (a.split_first(), b.split_first()) {
(None, None) => Rule::Empty,
(Some(_), None) => return Err(Error),
(None, Some(_)) => range(&[0], b, false, radix)?.optional(),
(Some((&ax, [])), Some((&bx, []))) => Rule::class(ax, bx),
(Some((&ax, a_rest)), Some((&bx, b_rest))) => {
let (min, max) = (u8::min(ax, bx), u8::max(ax, bx));
let mut alternatives = vec![];
if min > lo_digit && a_rest.len() < b_rest.len() {
alternatives.push(vec![
Rule::class(lo_digit, min - 1),
Rule::class(0, hi_digit).repeat(a_rest.len() + 1, b_rest.len()),
]);
}
match ax.cmp(&bx) {
Ordering::Equal => {
alternatives
.push(vec![Rule::class(ax, bx), range(a_rest, b_rest, false, radix)?]);
}
Ordering::Less => {
if is_first && ax == 0 && a_rest.is_empty() {
alternatives.push(vec![Rule::class(0, 0)]);
} else {
alternatives.push(vec![
Rule::class(min, min),
range(a_rest, &vec![hi_digit; b_rest.len()], false, radix)?,
]);
}
if max - min >= 2 {
alternatives.push(vec![
Rule::class(min + 1, max - 1),
Rule::class(0, hi_digit).repeat(a_rest.len(), b_rest.len()),
]);
}
alternatives.push(vec![
Rule::class(max, max),
range(&vec![0; a_rest.len()], b_rest, false, radix)?,
]);
}
Ordering::Greater => {
alternatives.push(vec![
Rule::class(min, min),
range(&vec![0; a.len()], b_rest, false, radix)?,
]);
if max - min >= 2 && a_rest.len() + 2 <= b_rest.len() {
alternatives.push(vec![
Rule::class(min + 1, max - 1),
Rule::class(0, hi_digit).repeat(a_rest.len() + 1, b_rest.len() - 1),
]);
}
alternatives.push(vec![
Rule::class(max, max),
range(a_rest, &vec![hi_digit; b_rest.len() - 1], false, radix)?,
]);
}
}
if max < hi_digit && a_rest.len() < b_rest.len() {
alternatives.push(vec![
Rule::class(max + 1, hi_digit),
Rule::class(0, hi_digit).repeat(a_rest.len(), b_rest.len() - 1),
]);
}
merge_and_optimize_alternatives(alternatives)
}
})
}
fn merge_and_optimize_alternatives(alternatives: Vec<Vec<Rule>>) -> Rule {
let capacity = alternatives.len();
let mut alternatives = alternatives.into_iter().fold(
Vec::with_capacity(capacity),
|mut acc: Vec<Vec<Rule>>, mut rules| {
if let [this1, this2] = rules.as_slice() {
if this1 == this2 {
let rule = rules.pop().unwrap();
rules[0] = rule.repeat(2, 2);
} else if *this2 == Rule::Empty {
rules.pop();
}
}
if let Some(last) = acc.last_mut() {
if let [Rule::Class(prev_class), prev] = last.as_mut_slice() {
if let [Rule::Class(this_class), ref mut this2] = rules.as_mut_slice() {
if prev == this2 {
debug_assert!(prev_class.end + 1 == this_class.start);
prev_class.end = this_class.end;
if let [last1, last2] = last.as_slice() {
if last1 == last2 {
let rule = last.pop().unwrap();
last[0] = rule.repeat(2, 2);
} else if let Rule::Repeat(r) = last2 {
if r.rule == *last1 {
let (min, max) = (r.min, r.max);
let _ = last.pop();
let rule = last.pop().unwrap();
last.push(rule.repeat(min + 1, max + 1));
}
}
}
return acc;
}
}
}
}
acc.push(rules);
acc
},
);
if alternatives.len() == 1 && alternatives[0].len() == 1 {
alternatives.pop().unwrap().pop().unwrap()
} else {
Rule::alt(alternatives)
}
}
struct Error;
#[derive(PartialEq, Eq)]
enum Rule {
Empty,
Class(Class),
Repeat(Box<Repeat>),
Alt(Alt),
}
#[cfg(FALSE)]
impl std::fmt::Debug for Rule {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Empty => write!(f, "Empty"),
Self::Class(Class { start, end }) => write!(f, "[{start}-{end}]"),
Self::Repeat(r) => {
if f.alternate() {
write!(f, "{:#?}{{{}, {}}}", r.rule, r.min, r.max)
} else {
write!(f, "{:?}{{{}, {}}}", r.rule, r.min, r.max)
}
}
Self::Alt(arg0) => {
let mut d = f.debug_tuple("Alt");
let mut d = &mut d;
for item in &arg0.0 {
if item.iter().map(|n| n.debug_size()).sum::<u64>() < 5 {
d = d.field(&format_args!(
"{}",
item.iter().map(|n| format!("{n:?} ")).collect::<String>().trim()
))
} else {
d = d.field(item);
}
}
d.finish()
}
}
}
}
impl Rule {
fn class(start: u8, end: u8) -> Self {
Rule::Class(Class { start, end })
}
fn repeat(self, min: usize, max: usize) -> Self {
debug_assert!(min <= max);
if max == 0 {
Rule::Empty
} else if max == 1 && min == 1 {
self
} else {
Rule::Repeat(Box::new(Repeat { rule: self, min, max }))
}
}
fn alt(alts: Vec<Vec<Rule>>) -> Self {
Rule::Alt(Alt(alts))
}
fn optional(self) -> Rule {
match self {
Rule::Repeat(mut repeat) if repeat.min <= 1 => {
repeat.min = 0;
Rule::Repeat(repeat)
}
rule => rule.repeat(0, 1),
}
}
fn to_regex(&self) -> Regex<'static> {
match self {
Rule::Empty => Regex::Literal(Cow::Borrowed("")),
Rule::Class(c) => c.to_regex(),
Rule::Repeat(r) => r.to_regex(),
Rule::Alt(a) => a.to_regex(),
}
}
#[cfg(FALSE)]
fn debug_size(&self) -> u64 {
match self {
Rule::Empty => 1,
Rule::Class(_) => 1,
Rule::Repeat(r) => 1 + r.rule.debug_size(),
Rule::Alt(_) => 10,
}
}
}
#[derive(PartialEq, Eq, Clone, Copy)]
struct Class {
start: u8,
end: u8,
}
#[derive(PartialEq, Eq)]
struct Repeat {
rule: Rule,
min: usize,
max: usize,
}
impl Repeat {
fn to_regex(&self) -> Regex<'static> {
Regex::Repetition(Box::new(RegexRepetition::new(
self.rule.to_regex(),
RepetitionKind::try_from((self.min as u32, Some(self.max as u32))).unwrap(),
RegexQuantifier::Greedy,
)))
}
}
#[derive(PartialEq, Eq)]
struct Alt(Vec<Vec<Rule>>);
impl Alt {
fn to_regex(&self) -> Regex<'static> {
Regex::Alternation(RegexAlternation::new(
self.0
.iter()
.map(|v| {
Regex::Group(RegexGroup::new(
v.iter().map(|r| r.to_regex()).collect(),
RegexCapture::None,
))
})
.collect(),
))
}
}
impl Class {
fn to_regex(self) -> Regex<'static> {
let (a, b) = (self.start, self.end);
Regex::CharClass(RegexCharClass::new(match (a, b, a == b) {
(0..=9, _, true) => return Regex::Char((a + b'0') as char),
(0..=9, 0..=9, _) => {
vec![RegexClassItem::range_unchecked((a + b'0') as char, (b + b'0') as char)]
}
(10.., _, true) => vec![
RegexClassItem::Char((a + b'a' - 10) as char),
RegexClassItem::Char((a + b'A' - 10) as char),
],
(10.., 10.., _) => vec![
RegexClassItem::range_unchecked((a + b'a' - 10) as char, (b + b'a' - 10) as char),
RegexClassItem::range_unchecked((a + b'A' - 10) as char, (b + b'A' - 10) as char),
],
(9, 10, _) => vec![
RegexClassItem::Char('9'),
RegexClassItem::Char('a'),
RegexClassItem::Char('A'),
],
(_, 10, _) => vec![
RegexClassItem::range_unchecked((a + b'0') as char, '9'),
RegexClassItem::Char('a'),
RegexClassItem::Char('A'),
],
(9, _, _) => vec![
RegexClassItem::Char('9'),
RegexClassItem::range_unchecked('a', (b + b'a' - 10) as char),
RegexClassItem::range_unchecked('A', (b + b'A' - 10) as char),
],
_ => vec![
RegexClassItem::range_unchecked((a + b'0') as char, '9'),
RegexClassItem::range_unchecked('a', (b + b'a' - 10) as char),
RegexClassItem::range_unchecked('A', (b + b'A' - 10) as char),
],
}))
}
}