use core::{cmp, mem, num::NonZeroUsize};
use alloc::{vec, vec::Vec};
use crate::hir::{self, Hir};
#[derive(Clone, Debug)]
pub struct Extractor {
kind: ExtractKind,
limit_class: usize,
limit_repeat: usize,
limit_literal_len: usize,
limit_total: usize,
}
impl Extractor {
pub fn new() -> Extractor {
Extractor {
kind: ExtractKind::Prefix,
limit_class: 10,
limit_repeat: 10,
limit_literal_len: 100,
limit_total: 250,
}
}
pub fn extract(&self, hir: &Hir) -> Seq {
use crate::hir::HirKind::*;
match *hir.kind() {
Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])),
Literal(hir::Literal(ref bytes)) => {
let mut seq =
Seq::singleton(self::Literal::exact(bytes.to_vec()));
self.enforce_literal_len(&mut seq);
seq
}
Class(hir::Class::Unicode(ref cls)) => {
self.extract_class_unicode(cls)
}
Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls),
Repetition(ref rep) => self.extract_repetition(rep),
Capture(hir::Capture { ref sub, .. }) => self.extract(sub),
Concat(ref hirs) => match self.kind {
ExtractKind::Prefix => self.extract_concat(hirs.iter()),
ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()),
},
Alternation(ref hirs) => {
self.extract_alternation(hirs.iter())
}
}
}
pub fn kind(&mut self, kind: ExtractKind) -> &mut Extractor {
self.kind = kind;
self
}
pub fn limit_class(&mut self, limit: usize) -> &mut Extractor {
self.limit_class = limit;
self
}
pub fn limit_repeat(&mut self, limit: usize) -> &mut Extractor {
self.limit_repeat = limit;
self
}
pub fn limit_literal_len(&mut self, limit: usize) -> &mut Extractor {
self.limit_literal_len = limit;
self
}
pub fn limit_total(&mut self, limit: usize) -> &mut Extractor {
self.limit_total = limit;
self
}
fn extract_concat<'a, I: Iterator<Item = &'a Hir>>(&self, it: I) -> Seq {
let mut seq = Seq::singleton(self::Literal::exact(vec![]));
for hir in it {
if seq.is_inexact() {
break;
}
seq = self.cross(seq, &mut self.extract(hir));
}
seq
}
fn extract_alternation<'a, I: Iterator<Item = &'a Hir>>(
&self,
it: I,
) -> Seq {
let mut seq = Seq::empty();
for hir in it {
if !seq.is_finite() {
break;
}
seq = self.union(seq, &mut self.extract(hir));
}
seq
}
fn extract_repetition(&self, rep: &hir::Repetition) -> Seq {
let mut subseq = self.extract(&rep.sub);
match *rep {
hir::Repetition { min: 0, max, greedy, .. } => {
if max != Some(1) {
subseq.make_inexact();
}
let mut empty = Seq::singleton(Literal::exact(vec![]));
if !greedy {
mem::swap(&mut subseq, &mut empty);
}
self.union(subseq, &mut empty)
}
hir::Repetition { min, max: Some(max), .. } if min == max => {
assert!(min > 0); let limit =
u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
let mut seq = Seq::singleton(Literal::exact(vec![]));
for _ in 0..cmp::min(min, limit) {
if seq.is_inexact() {
break;
}
seq = self.cross(seq, &mut subseq.clone());
}
if usize::try_from(min).is_err() || min > limit {
seq.make_inexact();
}
seq
}
hir::Repetition { min, .. } => {
assert!(min > 0); let limit =
u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
let mut seq = Seq::singleton(Literal::exact(vec![]));
for _ in 0..cmp::min(min, limit) {
if seq.is_inexact() {
break;
}
seq = self.cross(seq, &mut subseq.clone());
}
seq.make_inexact();
seq
}
}
}
fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> Seq {
if self.class_over_limit_unicode(cls) {
return Seq::infinite();
}
let mut seq = Seq::empty();
for r in cls.iter() {
for ch in r.start()..=r.end() {
seq.push(Literal::from(ch));
}
}
self.enforce_literal_len(&mut seq);
seq
}
fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> Seq {
if self.class_over_limit_bytes(cls) {
return Seq::infinite();
}
let mut seq = Seq::empty();
for r in cls.iter() {
for b in r.start()..=r.end() {
seq.push(Literal::from(b));
}
}
self.enforce_literal_len(&mut seq);
seq
}
fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool {
let mut count = 0;
for r in cls.iter() {
if count > self.limit_class {
return true;
}
count += r.len();
}
count > self.limit_class
}
fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool {
let mut count = 0;
for r in cls.iter() {
if count > self.limit_class {
return true;
}
count += r.len();
}
count > self.limit_class
}
fn cross(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq {
if seq1.max_cross_len(seq2).map_or(false, |len| len > self.limit_total)
{
seq2.make_infinite();
}
if let ExtractKind::Suffix = self.kind {
seq1.cross_reverse(seq2);
} else {
seq1.cross_forward(seq2);
}
assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
self.enforce_literal_len(&mut seq1);
seq1
}
fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq {
if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total)
{
match self.kind {
ExtractKind::Prefix => {
seq1.keep_first_bytes(4);
seq2.keep_first_bytes(4);
}
ExtractKind::Suffix => {
seq1.keep_last_bytes(4);
seq2.keep_last_bytes(4);
}
}
seq1.dedup();
seq2.dedup();
if seq1
.max_union_len(seq2)
.map_or(false, |len| len > self.limit_total)
{
seq2.make_infinite();
}
}
seq1.union(seq2);
assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
seq1
}
fn enforce_literal_len(&self, seq: &mut Seq) {
let len = self.limit_literal_len;
match self.kind {
ExtractKind::Prefix => seq.keep_first_bytes(len),
ExtractKind::Suffix => seq.keep_last_bytes(len),
}
}
}
impl Default for Extractor {
fn default() -> Extractor {
Extractor::new()
}
}
#[non_exhaustive]
#[derive(Clone, Debug)]
pub enum ExtractKind {
Prefix,
Suffix,
}
impl ExtractKind {
pub fn is_prefix(&self) -> bool {
matches!(*self, ExtractKind::Prefix)
}
pub fn is_suffix(&self) -> bool {
matches!(*self, ExtractKind::Suffix)
}
}
impl Default for ExtractKind {
fn default() -> ExtractKind {
ExtractKind::Prefix
}
}
#[derive(Clone, Eq, PartialEq)]
pub struct Seq {
literals: Option<Vec<Literal>>,
}
impl Seq {
#[inline]
pub fn empty() -> Seq {
Seq { literals: Some(vec![]) }
}
#[inline]
pub fn infinite() -> Seq {
Seq { literals: None }
}
#[inline]
pub fn singleton(lit: Literal) -> Seq {
Seq { literals: Some(vec![lit]) }
}
#[inline]
pub fn new<I, B>(it: I) -> Seq
where
I: IntoIterator<Item = B>,
B: AsRef<[u8]>,
{
it.into_iter().map(|b| Literal::exact(b.as_ref())).collect()
}
#[inline]
pub fn literals(&self) -> Option<&[Literal]> {
self.literals.as_deref()
}
#[inline]
pub fn push(&mut self, lit: Literal) {
let lits = match self.literals {
None => return,
Some(ref mut lits) => lits,
};
if lits.last().map_or(false, |m| m == &lit) {
return;
}
lits.push(lit);
}
#[inline]
pub fn make_inexact(&mut self) {
let lits = match self.literals {
None => return,
Some(ref mut lits) => lits,
};
for lit in lits.iter_mut() {
lit.make_inexact();
}
}
#[inline]
pub fn make_infinite(&mut self) {
self.literals = None;
}
#[inline]
pub fn cross_forward(&mut self, other: &mut Seq) {
let (lits1, lits2) = match self.cross_preamble(other) {
None => return,
Some((lits1, lits2)) => (lits1, lits2),
};
let newcap = lits1.len().saturating_mul(lits2.len());
for selflit in mem::replace(lits1, Vec::with_capacity(newcap)) {
if !selflit.is_exact() {
lits1.push(selflit);
continue;
}
for otherlit in lits2.iter() {
let mut newlit = Literal::exact(Vec::with_capacity(
selflit.len() + otherlit.len(),
));
newlit.extend(&selflit);
newlit.extend(&otherlit);
if !otherlit.is_exact() {
newlit.make_inexact();
}
lits1.push(newlit);
}
}
lits2.drain(..);
self.dedup();
}
#[inline]
pub fn cross_reverse(&mut self, other: &mut Seq) {
let (lits1, lits2) = match self.cross_preamble(other) {
None => return,
Some((lits1, lits2)) => (lits1, lits2),
};
let newcap = lits1.len().saturating_mul(lits2.len());
let selflits = mem::replace(lits1, Vec::with_capacity(newcap));
for (i, otherlit) in lits2.drain(..).enumerate() {
for selflit in selflits.iter() {
if !selflit.is_exact() {
if i == 0 {
lits1.push(selflit.clone());
}
continue;
}
let mut newlit = Literal::exact(Vec::with_capacity(
otherlit.len() + selflit.len(),
));
newlit.extend(&otherlit);
newlit.extend(&selflit);
if !otherlit.is_exact() {
newlit.make_inexact();
}
lits1.push(newlit);
}
}
self.dedup();
}
fn cross_preamble<'a>(
&'a mut self,
other: &'a mut Seq,
) -> Option<(&'a mut Vec<Literal>, &'a mut Vec<Literal>)> {
let lits2 = match other.literals {
None => {
if self.min_literal_len() == Some(0) {
*self = Seq::infinite();
} else {
self.make_inexact();
}
return None;
}
Some(ref mut lits) => lits,
};
let lits1 = match self.literals {
None => {
lits2.drain(..);
return None;
}
Some(ref mut lits) => lits,
};
Some((lits1, lits2))
}
#[inline]
pub fn union(&mut self, other: &mut Seq) {
let lits2 = match other.literals {
None => {
self.make_infinite();
return;
}
Some(ref mut lits) => lits.drain(..),
};
let lits1 = match self.literals {
None => return,
Some(ref mut lits) => lits,
};
lits1.extend(lits2);
self.dedup();
}
#[inline]
pub fn union_into_empty(&mut self, other: &mut Seq) {
let lits2 = other.literals.as_mut().map(|lits| lits.drain(..));
let lits1 = match self.literals {
None => return,
Some(ref mut lits) => lits,
};
let first_empty = match lits1.iter().position(|m| m.is_empty()) {
None => return,
Some(i) => i,
};
let lits2 = match lits2 {
None => {
self.literals = None;
return;
}
Some(lits) => lits,
};
lits1.retain(|m| !m.is_empty());
lits1.splice(first_empty..first_empty, lits2);
self.dedup();
}
#[inline]
pub fn dedup(&mut self) {
if let Some(ref mut lits) = self.literals {
lits.dedup_by(|lit1, lit2| {
if lit1.as_bytes() != lit2.as_bytes() {
return false;
}
if lit1.is_exact() != lit2.is_exact() {
lit1.make_inexact();
lit2.make_inexact();
}
true
});
}
}
#[inline]
pub fn sort(&mut self) {
if let Some(ref mut lits) = self.literals {
lits.sort();
}
}
#[inline]
pub fn reverse_literals(&mut self) {
if let Some(ref mut lits) = self.literals {
for lit in lits.iter_mut() {
lit.reverse();
}
}
}
#[inline]
pub fn minimize_by_preference(&mut self) {
if let Some(ref mut lits) = self.literals {
PreferenceTrie::minimize(lits, false);
}
}
#[inline]
pub fn keep_first_bytes(&mut self, len: usize) {
if let Some(ref mut lits) = self.literals {
for m in lits.iter_mut() {
m.keep_first_bytes(len);
}
}
}
#[inline]
pub fn keep_last_bytes(&mut self, len: usize) {
if let Some(ref mut lits) = self.literals {
for m in lits.iter_mut() {
m.keep_last_bytes(len);
}
}
}
#[inline]
pub fn is_finite(&self) -> bool {
self.literals.is_some()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.len() == Some(0)
}
#[inline]
pub fn len(&self) -> Option<usize> {
self.literals.as_ref().map(|lits| lits.len())
}
#[inline]
pub fn is_exact(&self) -> bool {
self.literals().map_or(false, |lits| lits.iter().all(|x| x.is_exact()))
}
#[inline]
pub fn is_inexact(&self) -> bool {
self.literals().map_or(true, |lits| lits.iter().all(|x| !x.is_exact()))
}
#[inline]
pub fn max_union_len(&self, other: &Seq) -> Option<usize> {
let len1 = self.len()?;
let len2 = other.len()?;
Some(len1.saturating_add(len2))
}
#[inline]
pub fn max_cross_len(&self, other: &Seq) -> Option<usize> {
let len1 = self.len()?;
let len2 = other.len()?;
Some(len1.saturating_mul(len2))
}
#[inline]
pub fn min_literal_len(&self) -> Option<usize> {
self.literals.as_ref()?.iter().map(|x| x.len()).min()
}
#[inline]
pub fn max_literal_len(&self) -> Option<usize> {
self.literals.as_ref()?.iter().map(|x| x.len()).max()
}
#[inline]
pub fn longest_common_prefix(&self) -> Option<&[u8]> {
let lits = match self.literals {
None => return None,
Some(ref lits) => lits,
};
if lits.len() == 0 {
return None;
}
let base = lits[0].as_bytes();
let mut len = base.len();
for m in lits.iter().skip(1) {
len = m
.as_bytes()
.iter()
.zip(base[..len].iter())
.take_while(|&(a, b)| a == b)
.count();
if len == 0 {
return Some(&[]);
}
}
Some(&base[..len])
}
#[inline]
pub fn longest_common_suffix(&self) -> Option<&[u8]> {
let lits = match self.literals {
None => return None,
Some(ref lits) => lits,
};
if lits.len() == 0 {
return None;
}
let base = lits[0].as_bytes();
let mut len = base.len();
for m in lits.iter().skip(1) {
len = m
.as_bytes()
.iter()
.rev()
.zip(base[base.len() - len..].iter().rev())
.take_while(|&(a, b)| a == b)
.count();
if len == 0 {
return Some(&[]);
}
}
Some(&base[base.len() - len..])
}
#[inline]
pub fn optimize_for_prefix_by_preference(&mut self) {
self.optimize_by_preference(true);
}
#[inline]
pub fn optimize_for_suffix_by_preference(&mut self) {
self.optimize_by_preference(false);
}
fn optimize_by_preference(&mut self, prefix: bool) {
let origlen = match self.len() {
None => return,
Some(len) => len,
};
if self.min_literal_len().map_or(false, |len| len == 0) {
self.make_infinite();
return;
}
if prefix {
if let Some(ref mut lits) = self.literals {
PreferenceTrie::minimize(lits, true);
}
}
let fix = if prefix {
self.longest_common_prefix()
} else {
self.longest_common_suffix()
};
if let Some(fix) = fix {
if prefix
&& origlen > 1
&& fix.len() >= 1
&& fix.len() <= 3
&& rank(fix[0]) < 200
{
self.keep_first_bytes(1);
self.dedup();
return;
}
let isfast =
self.is_exact() && self.len().map_or(false, |len| len <= 16);
let usefix = fix.len() > 4 || (fix.len() > 1 && !isfast);
if usefix {
if prefix {
self.keep_first_bytes(fix.len());
} else {
self.keep_last_bytes(fix.len());
}
self.dedup();
assert_eq!(Some(1), self.len());
}
}
let exact: Option<Seq> =
if self.is_exact() { Some(self.clone()) } else { None };
const ATTEMPTS: [(usize, usize); 5] =
[(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)];
for (keep, limit) in ATTEMPTS {
let len = match self.len() {
None => break,
Some(len) => len,
};
if len <= limit {
break;
}
if prefix {
self.keep_first_bytes(keep);
} else {
self.keep_last_bytes(keep);
}
if prefix {
if let Some(ref mut lits) = self.literals {
PreferenceTrie::minimize(lits, true);
}
}
}
if let Some(lits) = self.literals() {
if lits.iter().any(|lit| lit.is_poisonous()) {
self.make_infinite();
}
}
if let Some(exact) = exact {
if !self.is_finite() {
*self = exact;
return;
}
if self.min_literal_len().map_or(true, |len| len <= 2) {
*self = exact;
return;
}
if self.len().map_or(true, |len| len > 64) {
*self = exact;
return;
}
}
}
}
impl core::fmt::Debug for Seq {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "Seq")?;
if let Some(lits) = self.literals() {
f.debug_list().entries(lits.iter()).finish()
} else {
write!(f, "[∞]")
}
}
}
impl FromIterator<Literal> for Seq {
fn from_iter<T: IntoIterator<Item = Literal>>(it: T) -> Seq {
let mut seq = Seq::empty();
for literal in it {
seq.push(literal);
}
seq
}
}
#[derive(Clone, Eq, PartialEq, PartialOrd, Ord)]
pub struct Literal {
bytes: Vec<u8>,
exact: bool,
}
impl Literal {
#[inline]
pub fn exact<B: Into<Vec<u8>>>(bytes: B) -> Literal {
Literal { bytes: bytes.into(), exact: true }
}
#[inline]
pub fn inexact<B: Into<Vec<u8>>>(bytes: B) -> Literal {
Literal { bytes: bytes.into(), exact: false }
}
#[inline]
pub fn as_bytes(&self) -> &[u8] {
&self.bytes
}
#[inline]
pub fn into_bytes(self) -> Vec<u8> {
self.bytes
}
#[inline]
pub fn len(&self) -> usize {
self.as_bytes().len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.len() == 0
}
#[inline]
pub fn is_exact(&self) -> bool {
self.exact
}
#[inline]
pub fn make_inexact(&mut self) {
self.exact = false;
}
#[inline]
pub fn reverse(&mut self) {
self.bytes.reverse();
}
#[inline]
pub fn extend(&mut self, lit: &Literal) {
if !self.is_exact() {
return;
}
self.bytes.extend_from_slice(&lit.bytes);
}
#[inline]
pub fn keep_first_bytes(&mut self, len: usize) {
if len >= self.len() {
return;
}
self.make_inexact();
self.bytes.truncate(len);
}
#[inline]
pub fn keep_last_bytes(&mut self, len: usize) {
if len >= self.len() {
return;
}
self.make_inexact();
self.bytes.drain(..self.len() - len);
}
fn is_poisonous(&self) -> bool {
self.is_empty() || (self.len() == 1 && rank(self.as_bytes()[0]) >= 250)
}
}
impl From<u8> for Literal {
fn from(byte: u8) -> Literal {
Literal::exact(vec![byte])
}
}
impl From<char> for Literal {
fn from(ch: char) -> Literal {
use alloc::string::ToString;
Literal::exact(ch.encode_utf8(&mut [0; 4]).to_string())
}
}
impl AsRef<[u8]> for Literal {
fn as_ref(&self) -> &[u8] {
self.as_bytes()
}
}
impl core::fmt::Debug for Literal {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let tag = if self.exact { "E" } else { "I" };
f.debug_tuple(tag)
.field(&crate::debug::Bytes(self.as_bytes()))
.finish()
}
}
#[derive(Debug)]
struct PreferenceTrie {
states: Vec<State>,
matches: Vec<Option<NonZeroUsize>>,
next_literal_index: usize,
}
#[derive(Debug, Default)]
struct State {
trans: Vec<(u8, usize)>,
}
impl PreferenceTrie {
fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
let mut trie = PreferenceTrie {
states: vec![],
matches: vec![],
next_literal_index: 1,
};
let mut make_inexact = vec![];
literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) {
Ok(_) => true,
Err(i) => {
if !keep_exact {
make_inexact.push(i.checked_sub(1).unwrap());
}
false
}
});
for i in make_inexact {
literals[i].make_inexact();
}
}
fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> {
let mut prev = self.root();
if let Some(idx) = self.matches[prev] {
return Err(idx.get());
}
for &b in bytes.iter() {
match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) {
Ok(i) => {
prev = self.states[prev].trans[i].1;
if let Some(idx) = self.matches[prev] {
return Err(idx.get());
}
}
Err(i) => {
let next = self.create_state();
self.states[prev].trans.insert(i, (b, next));
prev = next;
}
}
}
let idx = self.next_literal_index;
self.next_literal_index += 1;
self.matches[prev] = NonZeroUsize::new(idx);
Ok(idx)
}
fn root(&mut self) -> usize {
if !self.states.is_empty() {
0
} else {
self.create_state()
}
}
fn create_state(&mut self) -> usize {
let id = self.states.len();
self.states.push(State::default());
self.matches.push(None);
id
}
}
pub fn rank(byte: u8) -> u8 {
crate::rank::BYTE_FREQUENCIES[usize::from(byte)]
}
#[cfg(test)]
mod tests {
use super::*;
fn parse(pattern: &str) -> Hir {
crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap()
}
fn prefixes(pattern: &str) -> Seq {
Extractor::new().kind(ExtractKind::Prefix).extract(&parse(pattern))
}
fn suffixes(pattern: &str) -> Seq {
Extractor::new().kind(ExtractKind::Suffix).extract(&parse(pattern))
}
fn e(pattern: &str) -> (Seq, Seq) {
(prefixes(pattern), suffixes(pattern))
}
#[allow(non_snake_case)]
fn E(x: &str) -> Literal {
Literal::exact(x.as_bytes())
}
#[allow(non_snake_case)]
fn I(x: &str) -> Literal {
Literal::inexact(x.as_bytes())
}
fn seq<I: IntoIterator<Item = Literal>>(it: I) -> Seq {
Seq::from_iter(it)
}
fn infinite() -> (Seq, Seq) {
(Seq::infinite(), Seq::infinite())
}
fn inexact<I1, I2>(it1: I1, it2: I2) -> (Seq, Seq)
where
I1: IntoIterator<Item = Literal>,
I2: IntoIterator<Item = Literal>,
{
(Seq::from_iter(it1), Seq::from_iter(it2))
}
fn exact<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) {
let s1 = Seq::new(it);
let s2 = s1.clone();
(s1, s2)
}
fn opt<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) {
let (mut p, mut s) = exact(it);
p.optimize_for_prefix_by_preference();
s.optimize_for_suffix_by_preference();
(p, s)
}
#[test]
fn literal() {
assert_eq!(exact(["a"]), e("a"));
assert_eq!(exact(["aaaaa"]), e("aaaaa"));
assert_eq!(exact(["A", "a"]), e("(?i-u)a"));
assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab"));
assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c"));
assert_eq!(exact([b"\xFF"]), e(r"(?-u:\xFF)"));
#[cfg(feature = "unicode-case")]
{
assert_eq!(exact(["☃"]), e("☃"));
assert_eq!(exact(["☃"]), e("(?i)☃"));
assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃"));
assert_eq!(exact(["Δ"]), e("Δ"));
assert_eq!(exact(["δ"]), e("δ"));
assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ"));
assert_eq!(exact(["Δ", "δ"]), e("(?i)δ"));
assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S"));
assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s"));
assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ"));
}
let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ";
assert_eq!(exact([letters]), e(letters));
}
#[test]
fn class() {
assert_eq!(exact(["a", "b", "c"]), e("[abc]"));
assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b"));
assert_eq!(exact(["δ", "ε"]), e("[εδ]"));
#[cfg(feature = "unicode-case")]
{
assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]"));
}
}
#[test]
fn look() {
assert_eq!(exact(["ab"]), e(r"a\Ab"));
assert_eq!(exact(["ab"]), e(r"a\zb"));
assert_eq!(exact(["ab"]), e(r"a(?m:^)b"));
assert_eq!(exact(["ab"]), e(r"a(?m:$)b"));
assert_eq!(exact(["ab"]), e(r"a\bb"));
assert_eq!(exact(["ab"]), e(r"a\Bb"));
assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b"));
assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b"));
assert_eq!(exact(["ab"]), e(r"^ab"));
assert_eq!(exact(["ab"]), e(r"$ab"));
assert_eq!(exact(["ab"]), e(r"(?m:^)ab"));
assert_eq!(exact(["ab"]), e(r"(?m:$)ab"));
assert_eq!(exact(["ab"]), e(r"\bab"));
assert_eq!(exact(["ab"]), e(r"\Bab"));
assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab"));
assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab"));
assert_eq!(exact(["ab"]), e(r"ab^"));
assert_eq!(exact(["ab"]), e(r"ab$"));
assert_eq!(exact(["ab"]), e(r"ab(?m:^)"));
assert_eq!(exact(["ab"]), e(r"ab(?m:$)"));
assert_eq!(exact(["ab"]), e(r"ab\b"));
assert_eq!(exact(["ab"]), e(r"ab\B"));
assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)"));
assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)"));
let expected = (seq([I("aZ"), E("ab")]), seq([I("Zb"), E("ab")]));
assert_eq!(expected, e(r"^aZ*b"));
}
#[test]
fn repetition() {
assert_eq!(exact(["a", ""]), e(r"a?"));
assert_eq!(exact(["", "a"]), e(r"a??"));
assert_eq!(inexact([I("a"), E("")], [I("a"), E("")]), e(r"a*"));
assert_eq!(inexact([E(""), I("a")], [E(""), I("a")]), e(r"a*?"));
assert_eq!(inexact([I("a")], [I("a")]), e(r"a+"));
assert_eq!(inexact([I("a")], [I("a")]), e(r"(a+)+"));
assert_eq!(exact(["ab"]), e(r"aZ{0}b"));
assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b"));
assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b"));
assert_eq!(
inexact([I("aZ"), E("ab")], [I("Zb"), E("ab")]),
e(r"aZ*b")
);
assert_eq!(
inexact([E("ab"), I("aZ")], [E("ab"), I("Zb")]),
e(r"aZ*?b")
);
assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+b"));
assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+?b"));
assert_eq!(exact(["aZZb"]), e(r"aZ{2}b"));
assert_eq!(inexact([I("aZZ")], [I("ZZb")]), e(r"aZ{2,3}b"));
assert_eq!(exact(["abc", ""]), e(r"(abc)?"));
assert_eq!(exact(["", "abc"]), e(r"(abc)??"));
assert_eq!(inexact([I("a"), E("b")], [I("ab"), E("b")]), e(r"a*b"));
assert_eq!(inexact([E("b"), I("a")], [E("b"), I("ab")]), e(r"a*?b"));
assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+"));
assert_eq!(inexact([I("a"), I("b")], [I("b")]), e(r"a*b+"));
assert_eq!(
inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
e(r"a*b*c")
);
assert_eq!(
inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
e(r"(a+)?(b+)?c")
);
assert_eq!(
inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
e(r"(a+|)(b+|)c")
);
assert_eq!(
inexact(
[I("a"), I("b"), I("c"), E("")],
[I("c"), I("b"), I("a"), E("")]
),
e(r"a*b*c*")
);
assert_eq!(inexact([I("a"), I("b"), I("c")], [I("c")]), e(r"a*b*c+"));
assert_eq!(inexact([I("a"), I("b")], [I("bc")]), e(r"a*b+c"));
assert_eq!(inexact([I("a"), I("b")], [I("c"), I("b")]), e(r"a*b+c*"));
assert_eq!(inexact([I("ab"), E("a")], [I("b"), E("a")]), e(r"ab*"));
assert_eq!(
inexact([I("ab"), E("ac")], [I("bc"), E("ac")]),
e(r"ab*c")
);
assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+"));
assert_eq!(inexact([I("ab")], [I("bc")]), e(r"ab+c"));
assert_eq!(
inexact([I("z"), E("azb")], [I("zazb"), E("azb")]),
e(r"z*azb")
);
let expected =
exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]);
assert_eq!(expected, e(r"[ab]{3}"));
let expected = inexact(
[
I("aaa"),
I("aab"),
I("aba"),
I("abb"),
I("baa"),
I("bab"),
I("bba"),
I("bbb"),
],
[
I("aaa"),
I("aab"),
I("aba"),
I("abb"),
I("baa"),
I("bab"),
I("bba"),
I("bbb"),
],
);
assert_eq!(expected, e(r"[ab]{3,4}"));
}
#[test]
fn concat() {
let empty: [&str; 0] = [];
assert_eq!(exact(["abcxyz"]), e(r"abc()xyz"));
assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)"));
assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz"));
assert_eq!(exact(empty), e(r"abc[a&&b]xyz"));
assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz"));
}
#[test]
fn alternation() {
assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz"));
assert_eq!(
inexact(
[E("abc"), I("mZ"), E("mo"), E("xyz")],
[E("abc"), I("Zo"), E("mo"), E("xyz")]
),
e(r"abc|mZ*o|xyz")
);
assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz"));
assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz"));
assert_eq!(exact(["aaa", "aaaaa"]), e(r"(?:|aa)aaa"));
assert_eq!(
inexact(
[I("aaa"), E(""), I("aaaaa"), E("aa")],
[I("aaa"), E(""), E("aa")]
),
e(r"(?:|aa)(?:aaa)*")
);
assert_eq!(
inexact(
[E(""), I("aaa"), E("aa"), I("aaaaa")],
[E(""), I("aaa"), E("aa")]
),
e(r"(?:|aa)(?:aaa)*?")
);
assert_eq!(
inexact([E("a"), I("b"), E("")], [E("a"), I("b"), E("")]),
e(r"a|b*")
);
assert_eq!(inexact([E("a"), I("b")], [E("a"), I("b")]), e(r"a|b+"));
assert_eq!(
inexact([I("a"), E("b"), E("c")], [I("ab"), E("b"), E("c")]),
e(r"a*b|c")
);
assert_eq!(
inexact(
[E("a"), E("b"), I("c"), E("")],
[E("a"), E("b"), I("c"), E("")]
),
e(r"a|(?:b|c*)")
);
assert_eq!(
inexact(
[I("a"), I("b"), E("c"), I("a"), I("ab"), E("c")],
[I("ac"), I("bc"), E("c"), I("ac"), I("abc"), E("c")],
),
e(r"(a|b)*c|(a|ab)*c")
);
assert_eq!(
exact(["abef", "abgh", "cdef", "cdgh"]),
e(r"(ab|cd)(ef|gh)")
);
assert_eq!(
exact([
"abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl",
"cdghij", "cdghkl",
]),
e(r"(ab|cd)(ef|gh)(ij|kl)")
);
assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}"));
assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}"));
assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}"));
}
#[test]
fn impossible() {
let empty: [&str; 0] = [];
assert_eq!(exact(empty), e(r"[a&&b]"));
assert_eq!(exact(empty), e(r"a[a&&b]"));
assert_eq!(exact(empty), e(r"[a&&b]b"));
assert_eq!(exact(empty), e(r"a[a&&b]b"));
assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b"));
assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b"));
assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b"));
assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b"));
assert_eq!(exact([""]), e(r"[a&&b]*"));
assert_eq!(exact(["MN"]), e(r"M[a&&b]*N"));
}
#[test]
fn anything() {
assert_eq!(infinite(), e(r"."));
assert_eq!(infinite(), e(r"(?s)."));
assert_eq!(infinite(), e(r"[A-Za-z]"));
assert_eq!(infinite(), e(r"[A-Z]"));
assert_eq!(exact([""]), e(r"[A-Z]{0}"));
assert_eq!(infinite(), e(r"[A-Z]?"));
assert_eq!(infinite(), e(r"[A-Z]*"));
assert_eq!(infinite(), e(r"[A-Z]+"));
assert_eq!((seq([I("1")]), Seq::infinite()), e(r"1[A-Z]"));
assert_eq!((seq([I("1")]), seq([I("2")])), e(r"1[A-Z]2"));
assert_eq!((Seq::infinite(), seq([I("123")])), e(r"[A-Z]+123"));
assert_eq!(infinite(), e(r"[A-Z]+123[A-Z]+"));
assert_eq!(infinite(), e(r"1|[A-Z]|3"));
assert_eq!(
(seq([E("1"), I("2"), E("3")]), Seq::infinite()),
e(r"1|2[A-Z]|3"),
);
assert_eq!(
(Seq::infinite(), seq([E("1"), I("2"), E("3")])),
e(r"1|[A-Z]2|3"),
);
assert_eq!(
(seq([E("1"), I("2"), E("4")]), seq([E("1"), I("3"), E("4")])),
e(r"1|2[A-Z]3|4"),
);
assert_eq!((Seq::infinite(), seq([I("2")])), e(r"(?:|1)[A-Z]2"));
assert_eq!(inexact([I("a")], [I("z")]), e(r"a.z"));
}
#[test]
fn anything_small_limits() {
fn prefixes(pattern: &str) -> Seq {
Extractor::new()
.kind(ExtractKind::Prefix)
.limit_total(10)
.extract(&parse(pattern))
}
fn suffixes(pattern: &str) -> Seq {
Extractor::new()
.kind(ExtractKind::Suffix)
.limit_total(10)
.extract(&parse(pattern))
}
fn e(pattern: &str) -> (Seq, Seq) {
(prefixes(pattern), suffixes(pattern))
}
assert_eq!(
(
seq([
I("aaa"),
I("aab"),
I("aba"),
I("abb"),
I("baa"),
I("bab"),
I("bba"),
I("bbb")
]),
seq([
I("aaa"),
I("aab"),
I("aba"),
I("abb"),
I("baa"),
I("bab"),
I("bba"),
I("bbb")
])
),
e(r"[ab]{3}{3}")
);
assert_eq!(infinite(), e(r"ab|cd|ef|gh|ij|kl|mn|op|qr|st|uv|wx|yz"));
}
#[test]
fn empty() {
assert_eq!(exact([""]), e(r""));
assert_eq!(exact([""]), e(r"^"));
assert_eq!(exact([""]), e(r"$"));
assert_eq!(exact([""]), e(r"(?m:^)"));
assert_eq!(exact([""]), e(r"(?m:$)"));
assert_eq!(exact([""]), e(r"\b"));
assert_eq!(exact([""]), e(r"\B"));
assert_eq!(exact([""]), e(r"(?-u:\b)"));
assert_eq!(exact([""]), e(r"(?-u:\B)"));
}
#[test]
fn odds_and_ends() {
assert_eq!((Seq::infinite(), seq([I("a")])), e(r".a"));
assert_eq!((seq([I("a")]), Seq::infinite()), e(r"a."));
assert_eq!(infinite(), e(r"a|."));
assert_eq!(infinite(), e(r".|a"));
let pat = r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]";
let expected = inexact(
["Mo'am", "Moam", "Mu'am", "Muam"].map(I),
[
"ddafi", "ddafy", "dhafi", "dhafy", "dzafi", "dzafy", "dafi",
"dafy", "tdafi", "tdafy", "thafi", "thafy", "tzafi", "tzafy",
"tafi", "tafy", "zdafi", "zdafy", "zhafi", "zhafy", "zzafi",
"zzafy", "zafi", "zafy",
]
.map(I),
);
assert_eq!(expected, e(pat));
assert_eq!(
(seq(["fn is_", "fn as_"].map(I)), Seq::infinite()),
e(r"fn is_([A-Z]+)|fn as_([A-Z]+)"),
);
assert_eq!(
inexact([I("foo")], [I("quux")]),
e(r"foo[A-Z]+bar[A-Z]+quux")
);
assert_eq!(infinite(), e(r"[A-Z]+bar[A-Z]+"));
assert_eq!(
exact(["Sherlock Holmes"]),
e(r"(?m)^Sherlock Holmes|Sherlock Holmes$")
);
assert_eq!(exact(["sa", "sb"]), e(r"\bs(?:[ab])"));
}
#[test]
#[cfg(feature = "unicode-case")]
fn holmes() {
let expected = inexact(
["HOL", "HOl", "HoL", "Hol", "hOL", "hOl", "hoL", "hol"].map(I),
[
"MES", "MEs", "Eſ", "MeS", "Mes", "eſ", "mES", "mEs", "meS",
"mes",
]
.map(I),
);
let (mut prefixes, mut suffixes) = e(r"(?i)Holmes");
prefixes.keep_first_bytes(3);
suffixes.keep_last_bytes(3);
prefixes.minimize_by_preference();
suffixes.minimize_by_preference();
assert_eq!(expected, (prefixes, suffixes));
}
#[test]
#[cfg(feature = "unicode-case")]
fn holmes_alt() {
let mut pre =
prefixes(r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker");
assert!(pre.len().unwrap() > 0);
pre.optimize_for_prefix_by_preference();
assert!(pre.len().unwrap() > 0);
}
#[test]
fn crazy_repeats() {
assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}"));
assert_eq!(
inexact([E("")], [E("")]),
e(r"(?:){64}{64}{64}{64}{64}{64}")
);
assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}"));
assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}"));
assert_eq!(
inexact([E("")], [E("")]),
e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}")
);
let repa = "a".repeat(100);
assert_eq!(
inexact([I(&repa)], [I(&repa)]),
e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}")
);
}
#[test]
fn huge() {
let pat = r#"(?-u)
2(?:
[45]\d{3}|
7(?:
1[0-267]|
2[0-289]|
3[0-29]|
4[01]|
5[1-3]|
6[013]|
7[0178]|
91
)|
8(?:
0[125]|
[139][1-6]|
2[0157-9]|
41|
6[1-35]|
7[1-5]|
8[1-8]|
90
)|
9(?:
0[0-2]|
1[0-4]|
2[568]|
3[3-6]|
5[5-7]|
6[0167]|
7[15]|
8[0146-9]
)
)\d{4}|
3(?:
12?[5-7]\d{2}|
0(?:
2(?:
[025-79]\d|
[348]\d{1,2}
)|
3(?:
[2-4]\d|
[56]\d?
)
)|
2(?:
1\d{2}|
2(?:
[12]\d|
[35]\d{1,2}|
4\d?
)
)|
3(?:
1\d{2}|
2(?:
[2356]\d|
4\d{1,2}
)
)|
4(?:
1\d{2}|
2(?:
2\d{1,2}|
[47]|
5\d{2}
)
)|
5(?:
1\d{2}|
29
)|
[67]1\d{2}|
8(?:
1\d{2}|
2(?:
2\d{2}|
3|
4\d
)
)
)\d{3}|
4(?:
0(?:
2(?:
[09]\d|
7
)|
33\d{2}
)|
1\d{3}|
2(?:
1\d{2}|
2(?:
[25]\d?|
[348]\d|
[67]\d{1,2}
)
)|
3(?:
1\d{2}(?:
\d{2}
)?|
2(?:
[045]\d|
[236-9]\d{1,2}
)|
32\d{2}
)|
4(?:
[18]\d{2}|
2(?:
[2-46]\d{2}|
3
)|
5[25]\d{2}
)|
5(?:
1\d{2}|
2(?:
3\d|
5
)
)|
6(?:
[18]\d{2}|
2(?:
3(?:
\d{2}
)?|
[46]\d{1,2}|
5\d{2}|
7\d
)|
5(?:
3\d?|
4\d|
[57]\d{1,2}|
6\d{2}|
8
)
)|
71\d{2}|
8(?:
[18]\d{2}|
23\d{2}|
54\d{2}
)|
9(?:
[18]\d{2}|
2[2-5]\d{2}|
53\d{1,2}
)
)\d{3}|
5(?:
02[03489]\d{2}|
1\d{2}|
2(?:
1\d{2}|
2(?:
2(?:
\d{2}
)?|
[457]\d{2}
)
)|
3(?:
1\d{2}|
2(?:
[37](?:
\d{2}
)?|
[569]\d{2}
)
)|
4(?:
1\d{2}|
2[46]\d{2}
)|
5(?:
1\d{2}|
26\d{1,2}
)|
6(?:
[18]\d{2}|
2|
53\d{2}
)|
7(?:
1|
24
)\d{2}|
8(?:
1|
26
)\d{2}|
91\d{2}
)\d{3}|
6(?:
0(?:
1\d{2}|
2(?:
3\d{2}|
4\d{1,2}
)
)|
2(?:
2[2-5]\d{2}|
5(?:
[3-5]\d{2}|
7
)|
8\d{2}
)|
3(?:
1|
2[3478]
)\d{2}|
4(?:
1|
2[34]
)\d{2}|
5(?:
1|
2[47]
)\d{2}|
6(?:
[18]\d{2}|
6(?:
2(?:
2\d|
[34]\d{2}
)|
5(?:
[24]\d{2}|
3\d|
5\d{1,2}
)
)
)|
72[2-5]\d{2}|
8(?:
1\d{2}|
2[2-5]\d{2}
)|
9(?:
1\d{2}|
2[2-6]\d{2}
)
)\d{3}|
7(?:
(?:
02|
[3-589]1|
6[12]|
72[24]
)\d{2}|
21\d{3}|
32
)\d{3}|
8(?:
(?:
4[12]|
[5-7]2|
1\d?
)|
(?:
0|
3[12]|
[5-7]1|
217
)\d
)\d{4}|
9(?:
[35]1|
(?:
[024]2|
81
)\d|
(?:
1|
[24]1
)\d{2}
)\d{3}
"#;
let (prefixes, suffixes) = e(pat);
assert!(!suffixes.is_finite());
assert_eq!(Some(243), prefixes.len());
}
#[test]
fn optimize() {
let (p, s) =
opt(["foobarfoobar", "foobar", "foobarzfoobar", "foobarfoobar"]);
assert_eq!(seq([I("foobar")]), p);
assert_eq!(seq([I("foobar")]), s);
let (p, s) = opt(["abba", "akka", "abccba"]);
assert_eq!(exact(["abba", "akka", "abccba"]), (p, s));
let (p, s) = opt(["sam", "samwise"]);
assert_eq!((seq([E("sam")]), seq([E("sam"), E("samwise")])), (p, s));
let (p, s) = opt(["foobarfoo", "foo", "", "foozfoo", "foofoo"]);
assert!(!p.is_finite());
assert!(!s.is_finite());
let mut p = seq([E("foobarfoo"), I("foo"), E(" "), E("foofoo")]);
p.optimize_for_prefix_by_preference();
assert!(!p.is_finite());
}
}