#![warn(missing_docs, clippy::pedantic)]
#![allow(dead_code)]
#![allow(unexpected_cfgs)]
#![allow(clippy::must_use_candidate)]
use crate::Visitor;
use arbitrary::{unstructured::Int, Result, Unstructured};
use regex_syntax::{
hir::{self, ClassBytes, ClassUnicode, Hir, HirKind, Repetition},
Parser,
};
use std::{
char,
cmp::Ordering,
error,
fmt::{self, Debug},
hash::{Hash, Hasher},
mem,
ops::RangeInclusive,
str::Utf8Error,
string::FromUtf8Error,
};
/// Size threshold, in code points, between the two Unicode class representations:
/// classes with more code points than this use `LongUnicodeClass`, all others are
/// expanded into an explicit character list (`ShortUnicodeClass`).
const SHORT_UNICODE_CLASS_COUNT: usize = 64;
/// Errors that can occur while compiling a pattern into a [`Regex`] generator.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Error {
/// The pattern contains a look-around assertion (anchor), which is not supported.
Anchor,
/// The pattern was rejected by the `regex_syntax` parser.
Syntax(Box<regex_syntax::Error>),
}
impl fmt::Display for Error {
    /// Writes a human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            // Syntax errors already know how to describe themselves.
            Self::Syntax(inner) => return fmt::Display::fmt(inner, f),
            Self::Anchor => "anchor is not supported",
        };
        f.write_str(message)
    }
}
impl error::Error for Error {
    /// Exposes the wrapped parser error as the underlying cause, when there is one.
    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
        match self {
            Self::Syntax(inner) => Some(inner),
            Self::Anchor => None,
        }
    }
}
impl From<regex_syntax::Error> for Error {
    /// Wraps a parser error in [`Error::Syntax`], boxing it on the way.
    fn from(e: regex_syntax::Error) -> Self {
        let boxed = Box::new(e);
        Self::Syntax(boxed)
    }
}
/// The encoding class of generated output.
///
/// The variants are ordered from most to least restrictive; composite patterns take the
/// maximum of their parts' encodings.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Encoding {
/// Only ASCII bytes.
Ascii = 0,
/// Valid UTF-8.
Utf8 = 1,
/// Arbitrary bytes that need not be valid UTF-8.
Binary = 2,
}
/// Internal storage of an [`EncodedString`], tagged by its encoding.
#[derive(Debug)]
enum Es {
/// A string made up of ASCII characters only.
Ascii(String),
/// A valid UTF-8 string.
Utf8(String),
/// Bytes that failed UTF-8 validation; the error value still owns the original bytes.
Binary(FromUtf8Error),
}
/// A byte string that remembers whether it is ASCII, UTF-8 or arbitrary binary data.
///
/// Comparison, ordering and hashing are all based on the raw bytes.
#[derive(Debug)]
pub struct EncodedString(Es);
impl EncodedString {
    /// Returns the raw bytes, whatever the encoding.
    pub fn as_bytes(&self) -> &[u8] {
        match &self.0 {
            Es::Binary(err) => err.as_bytes(),
            Es::Ascii(text) | Es::Utf8(text) => text.as_bytes(),
        }
    }

    /// Returns the content as a string slice.
    ///
    /// # Errors
    ///
    /// Returns the UTF-8 validation error when the content is binary.
    pub fn as_str(&self) -> Result<&str, Utf8Error> {
        match &self.0 {
            Es::Binary(err) => Err(err.utf8_error()),
            Es::Ascii(text) | Es::Utf8(text) => Ok(text.as_str()),
        }
    }

    /// Returns the encoding tag of this string.
    pub fn encoding(&self) -> Encoding {
        match &self.0 {
            Es::Binary(_) => Encoding::Binary,
            Es::Utf8(_) => Encoding::Utf8,
            Es::Ascii(_) => Encoding::Ascii,
        }
    }
}
impl From<EncodedString> for Vec<u8> {
fn from(es: EncodedString) -> Self {
match es.0 {
Es::Ascii(s) | Es::Utf8(s) => s.into_bytes(),
Es::Binary(e) => e.into_bytes(),
}
}
}
impl From<Vec<u8>> for EncodedString {
    /// Classifies the bytes: valid UTF-8 goes through the `String` conversion,
    /// anything else is kept as binary.
    fn from(b: Vec<u8>) -> Self {
        String::from_utf8(b).map_or_else(
            |err| Self(Es::Binary(err)),
            <Self as From<String>>::from,
        )
    }
}
impl From<String> for EncodedString {
fn from(s: String) -> Self {
Self(if s.is_ascii() {
Es::Ascii(s)
} else {
Es::Utf8(s)
})
}
}
impl TryFrom<EncodedString> for String {
type Error = FromUtf8Error;
fn try_from(es: EncodedString) -> Result<Self, Self::Error> {
match es.0 {
Es::Ascii(s) | Es::Utf8(s) => Ok(s),
Es::Binary(e) => Err(e),
}
}
}
impl PartialEq for EncodedString {
    /// Two encoded strings are equal when their raw bytes are equal.
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.as_bytes(), other.as_bytes());
        lhs == rhs
    }
}

impl Eq for EncodedString {}
impl PartialOrd for EncodedString {
    /// Delegates to the total order defined by [`Ord`].
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
impl Ord for EncodedString {
    /// Orders encoded strings by comparing their raw bytes.
    fn cmp(&self, other: &Self) -> Ordering {
        let (lhs, rhs) = (self.as_bytes(), other.as_bytes());
        lhs.cmp(rhs)
    }
}
impl Hash for EncodedString {
    /// Hashes the raw bytes, consistent with the byte-based `Eq` implementation.
    fn hash<H: Hasher>(&self, state: &mut H) {
        let bytes = self.as_bytes();
        bytes.hash(state);
    }
}
/// A generator that produces byte strings from a compiled regular expression.
#[derive(Clone, Debug)]
pub struct Regex {
// The compiled pattern that is evaluated for every sample.
compiled: Compiled,
// Initial capacity reserved for the output buffer of each sample.
capacity: usize,
// Encoding class of the output, derived from the pattern at compile time.
encoding: Encoding,
}
impl Distribution<Vec<u8>> for Regex {
    /// Generates one sample as raw bytes.
    fn sample<'a>(&self, u: &mut Unstructured<'a>) -> Result<Vec<u8>> {
        let output = Vec::with_capacity(self.capacity);
        let mut ctx = EvalCtx { output };
        ctx.eval(&self.compiled, u).map(|()| ctx.output)
    }
}
impl Distribution<String> for Regex {
fn sample<'a>(&self, u: &mut Unstructured<'a>) -> Result<String> {
<Self as Distribution<Result<_, _>>>::sample(self, u).map(|r| r.unwrap())
}
}
impl Distribution<Result<String, FromUtf8Error>> for Regex {
    /// Generates one sample and converts it to a `String`, reporting invalid UTF-8
    /// through the inner `Result`.
    fn sample<'a>(&self, u: &mut Unstructured<'a>) -> Result<Result<String, FromUtf8Error>> {
        let bytes = <Self as Distribution<Vec<u8>>>::sample(self, u)?;
        if !self.is_utf8() {
            // Binary generators may emit anything, so the bytes must be validated.
            return Ok(String::from_utf8(bytes));
        }
        // SAFETY: relies on the invariant that `encoding` is `Ascii` or `Utf8` only when
        // every fragment of the compiled pattern emits valid UTF-8.
        let text = unsafe { String::from_utf8_unchecked(bytes) };
        Ok(Ok(text))
    }
}
impl Distribution<EncodedString> for Regex {
fn sample<'a>(&self, u: &mut Unstructured<'a>) -> Result<EncodedString> {
let result = <Self as Distribution<Result<_, _>>>::sample(self, u)?;
Ok(EncodedString(match result {
Err(e) => Es::Binary(e),
Ok(s) => {
if self.is_ascii() || s.is_ascii() {
Es::Ascii(s)
} else {
Es::Utf8(s)
}
}
}))
}
}
impl Default for Regex {
    /// Creates a generator whose compiled pattern is the empty literal.
    #[inline]
    fn default() -> Self {
        let compiled = Compiled::default();
        Self {
            compiled,
            encoding: Encoding::Ascii,
            capacity: 0,
        }
    }
}
impl Regex {
pub(crate) fn visit<V: Visitor>(&self, v: &mut V, u: &mut Unstructured<'_>) -> Result<()> {
let sample: Vec<u8> = self.sample(u)?;
v.visit_regex(&sample);
Ok(())
}
pub const fn encoding(&self) -> Encoding {
self.encoding
}
#[inline]
pub const fn is_ascii(&self) -> bool {
(self.encoding as u8) == (Encoding::Ascii as u8)
}
#[inline]
pub const fn is_utf8(&self) -> bool {
(self.encoding as u8) <= (Encoding::Utf8 as u8)
}
#[inline]
pub const fn capacity(&self) -> usize {
self.capacity
}
pub fn compile(pattern: &str, max_repeat: u32) -> Result<Self, Error> {
let hir = Parser::new().parse(pattern)?;
Self::with_hir(hir, max_repeat)
}
pub fn with_hir(hir: Hir, max_repeat: u32) -> Result<Self, Error> {
match hir.into_kind() {
HirKind::Empty => Ok(Self::default()),
HirKind::Look(_) => Err(Error::Anchor),
HirKind::Capture(hir::Capture { sub, .. }) => Self::with_hir(*sub, max_repeat),
HirKind::Literal(hir::Literal(bytes)) => Ok(Self::with_bytes_literal(bytes.into())),
HirKind::Class(hir::Class::Unicode(class)) => Ok(Self::with_unicode_class(&class)),
HirKind::Class(hir::Class::Bytes(class)) => Ok(Self::with_byte_class(&class)),
HirKind::Repetition(rep) => Self::with_repetition(rep, max_repeat),
HirKind::Concat(hirs) => Self::with_sequence(hirs, max_repeat),
HirKind::Alternation(hirs) => Self::with_choices(hirs, max_repeat),
}
}
fn with_bytes_literal(bytes: Vec<u8>) -> Self {
let es = EncodedString::from(bytes);
let encoding = es.encoding();
let bytes = Vec::from(es);
Self {
capacity: bytes.len(),
compiled: Kind::Literal(bytes).into(),
encoding,
}
}
fn with_unicode_class(class: &ClassUnicode) -> Self {
if let Some(byte_class) = class.to_byte_class() {
Self::with_byte_class(&byte_class)
} else {
Self {
compiled: compile_unicode_class(class.ranges()).into(),
capacity: class.maximum_len().unwrap_or(0),
encoding: Encoding::Utf8,
}
}
}
fn with_byte_class(class: &ClassBytes) -> Self {
Self {
compiled: Kind::ByteClass(ByteClass::compile(class.ranges())).into(),
capacity: 1,
encoding: if class.is_ascii() {
Encoding::Ascii
} else {
Encoding::Binary
},
}
}
fn with_repetition(rep: Repetition, max_repeat: u32) -> Result<Self, Error> {
let lower = rep.min;
let upper = rep.max.unwrap_or(lower + max_repeat);
if upper == 0 {
return Ok(Self::default());
}
let mut regex = Self::with_hir(*rep.sub, max_repeat)?;
regex.capacity *= upper as usize;
if lower == upper {
regex.compiled.repeat_const *= upper;
} else {
regex
.compiled
.repeat_ranges
.push(Uniform::new_inclusive(lower, upper));
}
if let Kind::Literal(lit) = &mut regex.compiled.kind {
if regex.compiled.repeat_const > 1 {
*lit = lit.repeat(regex.compiled.repeat_const as usize);
regex.compiled.repeat_const = 1;
}
}
Ok(regex)
}
fn with_sequence(hirs: Vec<Hir>, max_repeat: u32) -> Result<Self, Error> {
let mut seq = Vec::with_capacity(hirs.len());
let mut capacity = 0;
let mut encoding = Encoding::Ascii;
for hir in hirs {
let regex = Self::with_hir(hir, max_repeat)?;
capacity += regex.capacity;
encoding = encoding.max(regex.encoding);
let compiled = regex.compiled;
if compiled.is_single() {
if let Kind::Sequence(mut s) = compiled.kind {
seq.append(&mut s);
continue;
}
}
seq.push(compiled);
}
let mut simplified = Vec::with_capacity(seq.len());
let mut combined_lit = Vec::new();
for cur in seq {
if cur.is_single() {
if let Kind::Literal(mut lit) = cur.kind {
combined_lit.append(&mut lit);
continue;
}
}
if !combined_lit.is_empty() {
simplified.push(Kind::Literal(mem::take(&mut combined_lit)).into());
}
simplified.push(cur);
}
if !combined_lit.is_empty() {
simplified.push(Kind::Literal(combined_lit).into());
}
let compiled = match simplified.len() {
0 => return Ok(Self::default()),
1 => simplified.swap_remove(0),
_ => Kind::Sequence(simplified).into(),
};
Ok(Self {
compiled,
capacity,
encoding,
})
}
fn with_choices(hirs: Vec<Hir>, max_repeat: u32) -> Result<Self, Error> {
let mut choices = Vec::with_capacity(hirs.len());
let mut capacity = 0;
let mut encoding = Encoding::Ascii;
for hir in hirs {
let regex = Self::with_hir(hir, max_repeat)?;
if regex.capacity > capacity {
capacity = regex.capacity;
}
encoding = encoding.max(regex.encoding);
let compiled = regex.compiled;
if compiled.is_single() {
if let Kind::Any {
choices: mut sc, ..
} = compiled.kind
{
choices.append(&mut sc);
continue;
}
}
choices.push(compiled);
}
Ok(Self {
compiled: Kind::Any {
index: Uniform::new_inclusive(0, choices.len() - 1),
choices,
}
.into(),
capacity,
encoding,
})
}
}
/// A compiled pattern fragment together with its repetition counts.
///
/// When evaluated, the fragment is emitted `repeat_const` times multiplied by one value
/// sampled from each entry of `repeat_ranges`.
#[derive(Clone, Debug)]
struct Compiled {
// Fixed multiplier applied to the repeat count.
repeat_const: u32,
// Ranges that each contribute one sampled multiplier to the repeat count.
repeat_ranges: Vec<Uniform<u32>>,
// What is emitted on every repetition.
kind: Kind,
}
impl Default for Compiled {
    /// Wraps the default [`Kind`] as an unrepeated fragment.
    fn default() -> Self {
        Self::from(Kind::default())
    }
}
impl Compiled {
    /// Returns `true` when the fragment is emitted exactly once: there are no sampled
    /// repetition ranges and the constant multiplier is 1.
    fn is_single(&self) -> bool {
        let has_no_ranges = self.repeat_ranges.is_empty();
        has_no_ranges && self.repeat_const == 1
    }
}
/// The kinds of fragment a pattern compiles into.
#[derive(Clone, Debug)]
enum Kind {
/// A fixed byte string.
Literal(Vec<u8>),
/// Fragments emitted one after another.
Sequence(Vec<Compiled>),
/// One fragment picked from `choices` using `index`.
Any {
index: Uniform<usize>,
choices: Vec<Compiled>,
},
/// A Unicode class with more than `SHORT_UNICODE_CLASS_COUNT` code points.
LongUnicodeClass(LongUnicodeClass),
/// A Unicode class small enough to be stored as an explicit character list.
ShortUnicodeClass(ShortUnicodeClass),
/// A class of single bytes.
ByteClass(ByteClass),
}
impl Default for Kind {
    /// The default fragment is an empty literal, which emits nothing.
    fn default() -> Self {
        let no_bytes: Vec<u8> = Vec::new();
        Self::Literal(no_bytes)
    }
}
impl From<Kind> for Compiled {
    /// Wraps `kind` as a fragment that is emitted exactly once.
    fn from(kind: Kind) -> Self {
        let repeat_ranges = Vec::new();
        Self {
            kind,
            repeat_ranges,
            repeat_const: 1,
        }
    }
}
/// Evaluation state for generating a single sample.
struct EvalCtx {
// Bytes generated so far.
output: Vec<u8>,
}
impl<'a> EvalCtx {
    /// Evaluates one compiled fragment, appending what it generates to `self.output`.
    ///
    /// The repeat count is the constant multiplier times one sample from every
    /// repetition range, drawn in order.
    fn eval(&mut self, compiled: &Compiled, u: &mut Unstructured<'a>) -> Result<()> {
        let count = compiled
            .repeat_ranges
            .iter()
            .try_fold(compiled.repeat_const, |acc, range| -> Result<u32> {
                Ok(acc * range.sample(u)?)
            })?;
        match &compiled.kind {
            Kind::Literal(lit) => {
                self.eval_literal(count, lit);
                Ok(())
            }
            Kind::Sequence(seq) => self.eval_sequence(count, seq, u),
            Kind::Any { index, choices } => self.eval_alt(count, index, choices, u),
            Kind::LongUnicodeClass(class) => self.eval_unicode_class(count, class, u),
            Kind::ShortUnicodeClass(class) => self.eval_unicode_class(count, class, u),
            Kind::ByteClass(class) => self.eval_byte_class(count, class, u),
        }
    }

    /// Appends `lit` to the output `count` times.
    fn eval_literal(&mut self, count: u32, lit: &[u8]) {
        (0..count).for_each(|_| self.output.extend_from_slice(lit));
    }

    /// Evaluates every fragment of `seq` in order, `count` times over.
    fn eval_sequence(
        &mut self,
        count: u32,
        seq: &[Compiled],
        u: &mut Unstructured<'a>,
    ) -> Result<()> {
        for _ in 0..count {
            seq.iter().try_for_each(|part| self.eval(part, u))?;
        }
        Ok(())
    }

    /// Picks a branch with `index` and evaluates it, `count` times over.
    fn eval_alt(
        &mut self,
        count: u32,
        index: &Uniform<usize>,
        choices: &[Compiled],
        u: &mut Unstructured<'a>,
    ) -> Result<()> {
        (0..count).try_for_each(|_| -> Result<()> {
            let pick = index.sample(u)?;
            self.eval(&choices[pick], u)
        })
    }

    /// Samples `count` characters from `class` and appends their UTF-8 encoding.
    fn eval_unicode_class(
        &mut self,
        count: u32,
        class: &impl Distribution<char>,
        u: &mut Unstructured<'a>,
    ) -> Result<()> {
        // Scratch space large enough for any UTF-8 encoded character.
        let mut utf8 = [0; 4];
        for _ in 0..count {
            let ch = class.sample(u)?;
            let encoded = ch.encode_utf8(&mut utf8);
            self.output.extend_from_slice(encoded.as_bytes());
        }
        Ok(())
    }

    /// Samples `count` bytes from `class` and appends them.
    fn eval_byte_class(
        &mut self,
        count: u32,
        class: &ByteClass,
        u: &mut Unstructured<'a>,
    ) -> Result<()> {
        for _ in 0..count {
            let byte = class.sample(u)?;
            self.output.push(byte);
        }
        Ok(())
    }
}
/// A Unicode class stored as a table of ranges and sampled through a normalized index.
#[derive(Clone, Debug)]
struct LongUnicodeClass {
// Samples a normalized index, i.e. a position among all code points of the class.
searcher: Uniform<u32>,
// One `(normalized_start, offset)` entry per range; adding `offset` to a normalized
// index that falls in the range gives the code point.
ranges: Box<[(u32, u32)]>,
}
impl Distribution<char> for LongUnicodeClass {
    /// Samples a normalized index and maps it back to the character it stands for.
    fn sample<'a>(&self, u: &mut Unstructured<'a>) -> Result<char> {
        let normalized_index = self.searcher.sample(u)?;
        let lookup = self
            .ranges
            .binary_search_by(|(range_start, _)| range_start.cmp(&normalized_index));
        let entry_index = match lookup {
            // The index is exactly the first position of a range.
            Ok(exact) => exact,
            // Otherwise it belongs to the range just before the insertion point.
            Err(insert_at) => insert_at - 1,
        };
        let (_, offset) = self.ranges[entry_index];
        let code = normalized_index + offset;
        Ok(char::from_u32(code).expect("valid char"))
    }
}
/// A Unicode class stored as an explicit list of its characters.
#[derive(Clone, Debug)]
struct ShortUnicodeClass {
// Samples a position in `cases`.
index: Uniform<usize>,
// Every character of the class.
cases: Box<[char]>,
}
impl Distribution<char> for ShortUnicodeClass {
    /// Picks one of the listed characters.
    fn sample<'a>(&self, u: &mut Unstructured<'a>) -> Result<char> {
        let slot = self.index.sample(u)?;
        Ok(self.cases[slot])
    }
}
/// Calls `push(start, end)` for every range in `ranges`, splitting a range in two when it
/// spans the gap between `U+D7FF` and `U+E000`, so that no pushed range crosses that gap.
fn compile_unicode_class_with(ranges: &[hir::ClassUnicodeRange], mut push: impl FnMut(char, char)) {
    const LAST_BEFORE_GAP: char = '\u{d7ff}';
    const FIRST_AFTER_GAP: char = '\u{e000}';
    for range in ranges {
        let (start, end) = (range.start(), range.end());
        let spans_gap = start <= LAST_BEFORE_GAP && end >= FIRST_AFTER_GAP;
        if !spans_gap {
            push(start, end);
            continue;
        }
        push(start, LAST_BEFORE_GAP);
        push(FIRST_AFTER_GAP, end);
    }
}
/// Compiles Unicode ranges into a class fragment.
///
/// Classes with more than `SHORT_UNICODE_CLASS_COUNT` code points keep a range table;
/// smaller ones are expanded into an explicit list of characters.
fn compile_unicode_class(ranges: &[hir::ClassUnicodeRange]) -> Kind {
    // First pass: lay the ranges end to end in a "normalized" index space.
    let mut table = Vec::new();
    let mut total = 0;
    compile_unicode_class_with(ranges, |start, end| {
        let first = u32::from(start);
        let last = u32::from(end);
        // Store where the range begins in normalized space and the offset back to it.
        table.push((total, first - total));
        total += last - first + 1;
    });
    if total as usize <= SHORT_UNICODE_CLASS_COUNT {
        // Second pass: enumerate every character of the small class.
        let mut cases = Vec::with_capacity(total as usize);
        compile_unicode_class_with(ranges, |start, end| {
            let codes = u32::from(start)..=u32::from(end);
            cases.extend(codes.map(|code| char::from_u32(code).expect("valid char")));
        });
        Kind::ShortUnicodeClass(ShortUnicodeClass {
            index: Uniform::new_inclusive(0, cases.len() - 1),
            cases: cases.into_boxed_slice(),
        })
    } else {
        Kind::LongUnicodeClass(LongUnicodeClass {
            searcher: Uniform::new_inclusive(0, total - 1),
            ranges: table.into_boxed_slice(),
        })
    }
}
/// A class of single bytes stored as an explicit list.
#[derive(Clone, Debug)]
struct ByteClass {
// Samples a position in `cases`.
index: Uniform<usize>,
// Every byte of the class.
cases: Box<[u8]>,
}
impl ByteClass {
    /// Expands the byte ranges into an explicit list of bytes.
    ///
    /// NOTE(review): assumes `ranges` covers at least one byte; an empty class would
    /// underflow `cases.len() - 1`.
    fn compile(ranges: &[hir::ClassBytesRange]) -> Self {
        let cases: Vec<u8> = ranges
            .iter()
            .flat_map(|range| range.start()..=range.end())
            .collect();
        let index = Uniform::new_inclusive(0, cases.len() - 1);
        Self {
            index,
            cases: cases.into_boxed_slice(),
        }
    }
}
impl Distribution<u8> for ByteClass {
    /// Picks one of the listed bytes.
    fn sample<'a>(&self, u: &mut Unstructured<'a>) -> Result<u8> {
        let slot = self.index.sample(u)?;
        Ok(self.cases[slot])
    }
}
/// A source of values of type `T` driven by unstructured input data.
trait Distribution<T> {
/// Produces one value, consuming data from `u` as needed.
fn sample<'a>(&self, u: &mut Unstructured<'a>) -> Result<T>;
}
/// An inclusive integer range that is sampled with `Unstructured::int_in_range`.
#[derive(Debug, Clone)]
struct Uniform<T: Int>(RangeInclusive<T>);
impl<T: Int> Uniform<T> {
fn new_inclusive(start: T, end: T) -> Self {
Self(start..=end)
}
}
impl<T: Int> Distribution<T> for Uniform<T> {
    /// Draws an integer from the stored inclusive range.
    fn sample<'a>(&self, u: &mut Unstructured<'a>) -> Result<T> {
        let Self(bounds) = self;
        u.int_in_range(bounds.clone())
    }
}
#[cfg(test)]
mod test {
use super::*;
use rand::RngCore;
use rand::SeedableRng;
use std::collections::HashSet;
use std::ops::RangeInclusive;
/// Fills `buf` with pseudo-random bytes from a `StdRng` seeded with `seed` and wraps the
/// buffer in an `Unstructured`.
pub fn rand_u_seed<'a>(buf: &'a mut [u8], seed: u64) -> Unstructured<'a> {
    rand::rngs::StdRng::seed_from_u64(seed).fill_bytes(buf);
    Unstructured::new(buf)
}
/// Generates `run_count` samples for `pattern` and checks that the generator reports
/// `encoding`, that every sample matches the pattern and fits the reported capacity, and
/// that the number of distinct samples lies within `distinct_count`.
fn check_str(
    pattern: &str,
    encoding: Encoding,
    distinct_count: RangeInclusive<usize>,
    run_count: usize,
) {
    let matcher = regex::Regex::new(pattern).unwrap();
    let generator = Regex::compile(pattern, 100).unwrap();
    assert!(generator.is_utf8());
    assert_eq!(generator.encoding(), encoding);

    let max_distinct = *distinct_count.end();
    let mut seen = HashSet::<String>::with_capacity(run_count.min(max_distinct));
    let mut buf = [0; 4096];
    for seed in 0..run_count {
        let mut u = rand_u_seed(&mut buf, seed as u64);
        let sample: String = generator.sample(&mut u).unwrap();
        assert!(sample.len() <= generator.capacity());
        assert!(
            matcher.is_match(&sample),
            "Wrong sample for pattern `{}`: `{}`",
            pattern,
            sample
        );
        seen.insert(sample);
    }

    let gen_count = seen.len();
    assert!(
        distinct_count.contains(&gen_count),
        "Distinct samples generated for pattern `{}` outside the range {:?}: {} (examples:\n{})",
        pattern,
        distinct_count,
        gen_count,
        seen.iter()
            .take(10)
            .map(|s| format!(" - {:#?}\n", s))
            .collect::<String>(),
    );
}
/// Number of samples to draw when a pattern has `distinct_count` possible outputs.
///
/// For `n > 1` this is `ceil((ln n + 6 ln 10) / (ln n - ln(n - 1)))`, the smallest `k`
/// satisfying `n * ((n - 1) / n)^k <= 1e-6`. For `n <= 1` a fixed 8 runs are used.
fn run_count_for_distinct_count(distinct_count: usize) -> usize {
    if distinct_count > 1 {
        let n = distinct_count as f64;
        let numerator = n.ln() + 6.0 * std::f64::consts::LN_10;
        let denominator = n.ln() - (n - 1.0).ln();
        (numerator / denominator).ceil() as usize
    } else {
        8
    }
}
#[test]
fn sanity_test_run_count() {
    // (distinct count, expected number of runs)
    let expectations = [
        (1, 8),
        (2, 21),
        (3, 37),
        (10, 153),
        (26, 436),
        (62, 1104),
        (128, 2381),
        (214, 4096),
    ];
    for (distinct, runs) in expectations {
        assert_eq!(run_count_for_distinct_count(distinct), runs);
    }
}
fn check_str_limited(pattern: &str, encoding: Encoding, distinct_count: usize) {
let run_count = run_count_for_distinct_count(distinct_count);
check_str(
pattern,
encoding,
distinct_count..=distinct_count,
run_count,
);
}
/// Checks a pattern with many possible outputs: 4096 samples are drawn and at least
/// `min_distinct_count` of them must be distinct.
fn check_str_unlimited(pattern: &str, encoding: Encoding, min_distinct_count: usize) {
    const RUNS: usize = 4096;
    let at_least = min_distinct_count..=RUNS;
    check_str(pattern, encoding, at_least, RUNS);
}
#[test]
fn test_proptest() {
    // ASCII patterns with an exactly known number of distinct outputs.
    let exact_cases = [
        ("foo", 1),
        ("foo|bar|baz", 3),
        ("a{0,8}", 9),
        ("a?", 2),
        ("a*", 101),
        ("a+", 101),
        ("a{4,}", 101),
        ("(foo|bar)(xyzzy|plugh)", 4),
    ];
    for (pattern, distinct) in exact_cases {
        check_str_limited(pattern, Encoding::Ascii, distinct);
    }
    for pattern in [".", "(?s)."] {
        check_str_unlimited(pattern, Encoding::Utf8, 4075);
    }
}
#[test]
fn test_regex_generate() {
    // (pattern, exact number of distinct outputs)
    let cases = [
        ("", 1),
        ("aBcDe", 1),
        ("[a-zA-Z0-9]", 62),
        ("a{3,8}", 6),
        ("a{3}", 1),
        ("a{3}-a{3}", 1),
        ("(abcde)", 1),
        ("a?b?", 4),
    ];
    for (pattern, distinct) in cases {
        check_str_limited(pattern, Encoding::Ascii, distinct);
    }
}
#[test]
#[cfg(feature = "unicode")]
fn test_unicode_cases() {
    // Case-insensitive ASCII patterns enumerate every case combination.
    for (pattern, distinct) in [("(?i:fOo)", 8), ("(?i:a|B)", 4)] {
        check_str_limited(pattern, Encoding::Ascii, distinct);
    }
    check_str_unlimited(r"(\p{Greek}\P{Greek})(?:\d{3,6})", Encoding::Utf8, 4096);
}
#[test]
fn test_ascii_character_classes() {
    // (POSIX class, number of characters in it)
    let classes = [
        ("[[:alnum:]]", 62),
        ("[[:alpha:]]", 52),
        ("[[:ascii:]]", 128),
        ("[[:blank:]]", 2),
        ("[[:cntrl:]]", 33),
        ("[[:digit:]]", 10),
        ("[[:graph:]]", 94),
        ("[[:lower:]]", 26),
        ("[[:print:]]", 95),
        ("[[:punct:]]", 32),
        ("[[:space:]]", 6),
        ("[[:upper:]]", 26),
        ("[[:word:]]", 63),
        ("[[:xdigit:]]", 22),
    ];
    for (pattern, size) in classes {
        check_str_limited(pattern, Encoding::Ascii, size);
    }
}
#[test]
#[cfg(feature = "unicode")]
fn sanity_test_unicode_character_classes_size() {
    // Counts the code points of a pattern that parses to a single Unicode class.
    fn count_class_chars(pattern: &str) -> usize {
        use regex_syntax::hir::{Class, HirKind};
        match regex_syntax::parse(pattern).unwrap().into_kind() {
            HirKind::Class(Class::Unicode(cls)) => cls.iter().map(|range| range.len()).sum(),
            _ => unreachable!(),
        }
    }

    // (pattern, expected number of code points)
    let expected_sizes = [
        (r"\p{L}", 141_028),
        (r"\p{M}", 2_501),
        (r"\p{N}", 1_911),
        (r"\p{P}", 855),
        (r"\p{S}", 8_514),
        (r"\p{Z}", 19),
        (r"\p{C}", 959_284),
        (r"\p{Latin}", 1_487),
        (r"\p{Greek}", 518),
        (r"\p{Cyrillic}", 508),
        (r"\p{Armenian}", 96),
        (r"\p{Hebrew}", 134),
        (r"\p{Arabic}", 1_373),
        (r"\p{Syriac}", 88),
        (r"\p{Thaana}", 50),
        (r"\p{Devanagari}", 164),
        (r"\p{Bengali}", 96),
        (r"\p{Gurmukhi}", 80),
        (r"\p{Gujarati}", 91),
        (r"\p{Oriya}", 91),
        (r"\p{Tamil}", 123),
        (r"\p{Hangul}", 11_739),
        (r"\p{Hiragana}", 381),
        (r"\p{Katakana}", 321),
        (r"\p{Han}", 99_030),
        (r"\p{Tagalog}", 23),
        (r"\p{Linear_B}", 211),
        (r"\p{Inherited}", 657),
        (r"\d", 760),
        (r"\s", 25),
        (r"\w", 144_667),
    ];
    for (pattern, size) in expected_sizes {
        assert_eq!(count_class_chars(pattern), size);
    }
}
#[test]
#[cfg(feature = "unicode")]
fn test_unicode_character_classes() {
    // Shorthands that fix the encoding to UTF-8.
    let exactly = |pattern: &str, count: usize| check_str_limited(pattern, Encoding::Utf8, count);
    let at_least = |pattern: &str, min: usize| check_str_unlimited(pattern, Encoding::Utf8, min);
    let within = |pattern: &str, distinct: RangeInclusive<usize>| {
        check_str(pattern, Encoding::Utf8, distinct, 4096);
    };

    at_least(r"\p{L}", 3999);
    within(r"\p{M}", 1941..=2501);
    within(r"\p{N}", 1630..=1911);
    within(r"\p{P}", 835..=855);
    at_least(r"\p{S}", 3151);
    exactly(r"\p{Z}", 19);
    at_least(r"\p{C}", 4073);
    at_least(r"\P{L}", 4073);
    at_least(r"\P{M}", 4075);
    at_least(r"\P{N}", 4075);
    at_least(r"\P{P}", 4075);
    at_least(r"\P{S}", 4075);
    at_least(r"\P{Z}", 4075);
    at_least(r"\P{C}", 4007);
}
#[test]
#[cfg(feature = "unicode")]
fn test_unicode_script_classes() {
    // Shorthands that fix the encoding to UTF-8.
    let exactly = |pattern: &str, count: usize| check_str_limited(pattern, Encoding::Utf8, count);
    let at_least = |pattern: &str, min: usize| check_str_unlimited(pattern, Encoding::Utf8, min);
    let within = |pattern: &str, distinct: RangeInclusive<usize>| {
        check_str(pattern, Encoding::Utf8, distinct, 4096);
    };

    within(r"\p{Latin}", 1352..=1487);
    within(r"\p{Greek}", 516..=518);
    within(r"\p{Cyrillic}", 506..=508);
    exactly(r"\p{Armenian}", 96);
    exactly(r"\p{Hebrew}", 134);
    within(r"\p{Arabic}", 1268..=1373);
    exactly(r"\p{Syriac}", 88);
    exactly(r"\p{Thaana}", 50);
    exactly(r"\p{Devanagari}", 164);
    exactly(r"\p{Bengali}", 96);
    exactly(r"\p{Gurmukhi}", 80);
    exactly(r"\p{Gujarati}", 91);
    exactly(r"\p{Oriya}", 91);
    exactly(r"\p{Tamil}", 123);
    at_least(r"\p{Hangul}", 3363);
    exactly(r"\p{Hiragana}", 381);
    exactly(r"\p{Katakana}", 321);
    at_least(r"\p{Han}", 3970);
    exactly(r"\p{Tagalog}", 23);
    exactly(r"\p{Linear_B}", 211);
    within(r"\p{Inherited}", 650..=657);
}
#[test]
#[cfg(feature = "unicode")]
fn test_perl_classes() {
    // (pattern, minimum number of distinct samples out of 4096 runs)
    let cases = [
        (r"\d+", 4061),
        (r"\D+", 4096),
        (r"\s+", 4014),
        (r"\S+", 4096),
        (r"\w+", 4095),
        (r"\W+", 4096),
    ];
    for (pattern, min_distinct) in cases {
        check_str_unlimited(pattern, Encoding::Utf8, min_distinct);
    }
}
// Helper that prints the number of code points in each listed Unicode class.
// `#[cfg(any())]` is never true, so this function is excluded from every build.
#[cfg(any())]
fn dump_categories() {
use regex_syntax::hir::*;
let categories = &[r"\p{Nd}", r"\p{Greek}"];
for cat in categories {
if let HirKind::Class(Class::Unicode(cls)) =
regex_syntax::Parser::new().parse(cat).unwrap().into_kind()
{
// Sum the sizes of all ranges in the class.
let s: u32 = cls
.iter()
.map(|r| u32::from(r.end()) - u32::from(r.start()) + 1)
.sum();
println!("{} => {}", cat, s);
}
}
}
#[test]
fn test_binary_generator() {
    const PATTERN: &str = r"PE\x00\x00.{20}";
    // Byte-oriented matcher used to validate every generated sample.
    let matcher = regex::bytes::RegexBuilder::new(PATTERN)
        .unicode(false)
        .dot_matches_new_line(true)
        .build()
        .unwrap();
    // Parse with the same flags, additionally allowing non-UTF-8 matches.
    let mut parser = regex_syntax::ParserBuilder::new()
        .unicode(false)
        .dot_matches_new_line(true)
        .utf8(false)
        .build();
    let hir = parser.parse(PATTERN).unwrap();

    let generator = Regex::with_hir(hir, 100).unwrap();
    assert_eq!(generator.capacity(), 24);
    assert!(!generator.is_utf8());
    assert_eq!(generator.encoding(), Encoding::Binary);

    let mut buf = [0; 4096];
    for seed in 0..8192u64 {
        let mut u = rand_u_seed(&mut buf, seed);
        let sample: Vec<u8> = generator.sample(&mut u).unwrap();
        assert!(
            matcher.is_match(&sample),
            "Wrong sample: {:?}, `{:?}`",
            matcher,
            sample
        );
    }
}
#[test]
fn test_encoding_generator_1() {
    let mut parser = regex_syntax::ParserBuilder::new()
        .unicode(false)
        .utf8(false)
        .build();
    let hir = parser.parse(r"[\x00-\xff]{2}").unwrap();
    let generator = Regex::with_hir(hir, 100).unwrap();

    // Number of samples seen per encoding, indexed by the `Encoding` discriminant.
    let mut encoding_counts = [0; 3];
    let mut buf = [0; 4096];
    for seed in 0..8192u64 {
        let mut u = rand_u_seed(&mut buf, seed);
        let sample: EncodedString = generator.sample(&mut u).unwrap();
        let bytes = sample.as_bytes();
        let encoding = sample.encoding();
        assert_eq!(bytes.len(), 2);
        // The reported encoding must be the narrowest one that fits the bytes.
        let narrowest = if bytes.is_ascii() {
            Encoding::Ascii
        } else if std::str::from_utf8(bytes).is_ok() {
            Encoding::Utf8
        } else {
            Encoding::Binary
        };
        assert_eq!(encoding, narrowest);
        encoding_counts[encoding as usize] += 1;
    }

    let [ascii, utf8, binary] = encoding_counts;
    assert!((1858..2243).contains(&ascii));
    assert!((169..319).contains(&utf8));
    assert!((5704..6102).contains(&binary));
}
#[test]
fn test_encoding_generator_2() {
    let generator = Regex::compile(r"[\u{0}-\u{b5}]{2}", 100).unwrap();
    // Only two slots: indexing would panic if a sample were ever tagged as binary.
    let mut encoding_counts = [0; 2];
    let mut buf = [0; 4096];
    for seed in 0..8192u64 {
        let mut u = rand_u_seed(&mut buf, seed);
        let sample: EncodedString = generator.sample(&mut u).unwrap();
        let encoding = sample.encoding();
        let text = sample.as_str().unwrap();
        assert_eq!(text.chars().count(), 2);
        if text.is_ascii() {
            assert_eq!(encoding, Encoding::Ascii);
            assert_eq!(text.len(), 2);
        } else {
            assert_eq!(encoding, Encoding::Utf8);
        }
        encoding_counts[encoding as usize] += 1;
    }
}
#[test]
fn test_encoding_generator_3() {
    let generator = Regex::compile(r"[\u{0}-\u{7f}]{2}", 100).unwrap();
    let mut buf = [0; 4096];
    for seed in 0..8192u64 {
        let mut u = rand_u_seed(&mut buf, seed);
        let sample: EncodedString = generator.sample(&mut u).unwrap();
        // An ASCII-only class must always be tagged as ASCII.
        assert_eq!(sample.encoding(), Encoding::Ascii);
        let text = String::try_from(sample).unwrap();
        assert_eq!(text.len(), 2);
    }
}
#[test]
#[should_panic(expected = "FromUtf8Error")]
fn test_generating_non_utf8_string() {
    let mut parser = regex_syntax::ParserBuilder::new()
        .unicode(false)
        .utf8(false)
        .build();
    let hir = parser.parse(r"\x88").unwrap();
    let generator = Regex::with_hir(hir, 100).unwrap();
    assert!(!generator.is_utf8());
    assert_eq!(generator.encoding(), Encoding::Binary);

    let mut buf = [0; 4096];
    let mut u = rand_u_seed(&mut buf, 1);
    // Sampling a `String` from a binary generator is expected to panic.
    let _: String = generator.sample(&mut u).unwrap();
}
}