rand_regex/
lib.rs

1#![warn(missing_docs, clippy::pedantic)]
2
3//! Generates strings or byte strings following the rules of a regular expression.
4//!
5//! ```
6//! # #[cfg(feature = "unicode")] {
7//! use rand::{SeedableRng, Rng};
8//!
9//! let mut rng = rand_xorshift::XorShiftRng::from_seed(*b"The initial seed");
10//!
11//! // creates a generator for sampling strings
12//! let gen = rand_regex::Regex::compile(r"\d{4}-\d{2}-\d{2}", 100).unwrap();
13//!
14//! // sample a few strings randomly
15//! let samples = (&mut rng).sample_iter(&gen).take(3).collect::<Vec<String>>();
16//!
17//! // all Unicode characters are included when sampling
18//! assert_eq!(samples, vec![
19//!     "꧰᪈৭᱃-𐒧᧒-௦۴".to_string(),
20//!     "𞓰۳𑛐꩑-᪄9-໔᮹".to_string(),
21//!     "𑛃𑃹९೭-١᥈-৫೪".to_string()
22//! ]);
23//!
24//! // you could use `regex_syntax::Hir` to include more options
25//! let mut parser = regex_syntax::ParserBuilder::new().unicode(false).build();
26//! let hir = parser.parse(r"\d{4}-\d{2}-\d{2}").unwrap();
27//! let gen = rand_regex::Regex::with_hir(hir, 100).unwrap();
28//! let samples = (&mut rng).sample_iter(&gen).take(3).collect::<Vec<String>>();
29//! assert_eq!(samples, vec![
30//!     "2839-82-12".to_string(),
31//!     "2857-86-63".to_string(),
32//!     "0381-04-99".to_string(),
33//! ]);
34//! # }
35//! ```
36
37#![allow(clippy::must_use_candidate)]
38
39use rand::{
40    distr::{uniform::Uniform, Distribution},
41    Rng,
42};
43use regex_syntax::{
44    hir::{self, ClassBytes, ClassUnicode, Hir, HirKind, Repetition},
45    Parser,
46};
47use std::{
48    char,
49    cmp::Ordering,
50    error,
51    fmt::{self, Debug},
52    hash::{Hash, Hasher},
53    mem,
54    str::Utf8Error,
55    string::FromUtf8Error,
56};
57
/// Largest number of code points a Unicode class may contain while still being
/// compiled into a `ShortUnicodeClass` (a direct lookup table of `char`s);
/// classes with more code points become a `LongUnicodeClass`.
58const SHORT_UNICODE_CLASS_COUNT: usize = 64;
59
60/// Error returned by [`Regex::compile()`] and [`Regex::with_hir()`].
61///
62/// # Examples
63///
64/// ```
65/// let gen = rand_regex::Regex::compile(r"^.{4}\b.{4}$", 100);
66/// assert_eq!(gen.err(), Some(rand_regex::Error::Anchor));
67/// ```
68#[derive(Debug, Clone, Eq, PartialEq)]
69pub enum Error {
70    /// Anchors (`^`, `$`, `\A`, `\z`) and word boundary assertions (`\b`, `\B`)
71    /// are not supported.
72    ///
73    /// If you really need to include anchors, please consider using rejection
74    /// sampling e.g.
75    ///
76    /// ```rust
77    /// # #[cfg(feature = "unicode")] {
78    /// use rand::Rng;
79    ///
80    /// // create the generator without the anchor
81    /// let gen = rand_regex::Regex::compile(r".{4}.{4}", 100).unwrap();
82    ///
83    /// // later filter the sampled result using a regex with the anchor
84    /// let filter_regex = regex::Regex::new(r"^.{4}\b.{4}$").unwrap();
85    /// let _sample = rand::thread_rng()
86    ///     .sample_iter::<String, _>(&gen)
87    ///     .filter(|s| filter_regex.is_match(s))
88    ///     .next()
89    ///     .unwrap();
90    /// # }
91    /// ```
92    Anchor,
93
94    /// The input regex has a syntax error.
95    ///
96    /// # Examples
97    ///
98    /// ```
99    /// let gen = rand_regex::Regex::compile(r"(", 100);
100    /// assert!(match gen {
101    ///     Err(rand_regex::Error::Syntax(_)) => true,
102    ///     _ => false,
103    /// });
104    /// ```
105    Syntax(Box<regex_syntax::Error>),
106
107    /// The regex can never be matched, and thus it is impossible to generate
108    /// any samples from the regex.
109    ///
110    /// # Examples
111    ///
112    /// ```
113    /// let err = rand_regex::Regex::compile(r"[a&&b]", 100).unwrap_err();
114    /// assert_eq!(err, rand_regex::Error::Unsatisfiable);
115    /// ```
116    Unsatisfiable,
117}
118
119impl fmt::Display for Error {
120    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
121        match self {
122            Self::Unsatisfiable => f.write_str("regex is unsatisfiable"),
123            Self::Anchor => f.write_str("anchor is not supported"),
124            Self::Syntax(e) => fmt::Display::fmt(e, f),
125        }
126    }
127}
128
129impl error::Error for Error {
130    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
131        match self {
132            Self::Unsatisfiable => None,
133            Self::Anchor => None,
134            Self::Syntax(e) => Some(e),
135        }
136    }
137}
138
139impl From<regex_syntax::Error> for Error {
140    fn from(e: regex_syntax::Error) -> Self {
141        Self::Syntax(Box::new(e))
142    }
143}
144
/// The encoding of a string, ordered from the narrowest (`Ascii`) to the
/// widest (`Binary`).
///
/// The derived ordering follows the declaration order, so the wider of two
/// encodings can be found with `max`.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum Encoding {
    /// Only ASCII content.
    Ascii = 0,
    /// Valid UTF-8 content.
    Utf8 = 1,
    /// Arbitrary bytes, not tied to any encoding.
    Binary = 2,
}
155
/// Private payload of [`EncodedString`].
///
/// It is kept as a separate, non-public type so that code outside this crate
/// cannot build an `Es::Ascii(non_ascii_string)` without the check being made.
#[derive(Debug)]
enum Es {
    /// Text made of ASCII characters only.
    Ascii(String),
    /// Text which is valid UTF-8.
    Utf8(String),
    /// Bytes which failed UTF-8 validation; the conversion error is stored
    /// because it holds both the bytes and the details of the failure.
    Binary(FromUtf8Error),
}
168
169/// A string together with its [`Encoding`].
170#[derive(Debug)]
171pub struct EncodedString(Es);
172
173impl EncodedString {
174    /// Obtains the raw bytes of this string.
175    pub fn as_bytes(&self) -> &[u8] {
176        match &self.0 {
177            Es::Ascii(s) | Es::Utf8(s) => s.as_bytes(),
178            Es::Binary(e) => e.as_bytes(),
179        }
180    }
181
182    /// Tries to convert this instance as a UTF-8 string.
183    ///
184    /// # Errors
185    ///
186    /// If this instance is not compatible with UTF-8, returns an error in the
187    /// same manner as [`std::str::from_utf8()`].
188    pub fn as_str(&self) -> Result<&str, Utf8Error> {
189        match &self.0 {
190            Es::Ascii(s) | Es::Utf8(s) => Ok(s),
191            Es::Binary(e) => Err(e.utf8_error()),
192        }
193    }
194
195    /// Returns the encoding of this string.
196    pub fn encoding(&self) -> Encoding {
197        match self.0 {
198            Es::Ascii(_) => Encoding::Ascii,
199            Es::Utf8(_) => Encoding::Utf8,
200            Es::Binary(_) => Encoding::Binary,
201        }
202    }
203}
204
205impl From<EncodedString> for Vec<u8> {
206    fn from(es: EncodedString) -> Self {
207        match es.0 {
208            Es::Ascii(s) | Es::Utf8(s) => s.into_bytes(),
209            Es::Binary(e) => e.into_bytes(),
210        }
211    }
212}
213
214impl From<Vec<u8>> for EncodedString {
215    fn from(b: Vec<u8>) -> Self {
216        match String::from_utf8(b) {
217            Ok(s) => Self::from(s),
218            Err(e) => Self(Es::Binary(e)),
219        }
220    }
221}
222
223impl From<String> for EncodedString {
224    fn from(s: String) -> Self {
225        Self(if s.is_ascii() {
226            Es::Ascii(s)
227        } else {
228            Es::Utf8(s)
229        })
230    }
231}
232
233impl TryFrom<EncodedString> for String {
234    type Error = FromUtf8Error;
235    fn try_from(es: EncodedString) -> Result<Self, Self::Error> {
236        match es.0 {
237            Es::Ascii(s) | Es::Utf8(s) => Ok(s),
238            Es::Binary(e) => Err(e),
239        }
240    }
241}
242
243impl PartialEq for EncodedString {
244    fn eq(&self, other: &Self) -> bool {
245        self.as_bytes() == other.as_bytes()
246    }
247}
248
249impl Eq for EncodedString {}
250
251impl PartialOrd for EncodedString {
252    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
253        Some(self.cmp(other))
254    }
255}
256
257impl Ord for EncodedString {
258    fn cmp(&self, other: &Self) -> Ordering {
259        self.as_bytes().cmp(other.as_bytes())
260    }
261}
262
263impl Hash for EncodedString {
264    fn hash<H: Hasher>(&self, state: &mut H) {
265        self.as_bytes().hash(state);
266    }
267}
268
269/// A random distribution which generates strings matching the specified regex.
270#[derive(Clone, Debug)]
271pub struct Regex {
272    compiled: Compiled,
273    capacity: usize,
274    encoding: Encoding,
275}
276
277impl Distribution<Vec<u8>> for Regex {
278    /// Samples a random byte string satisfying the regex.
279    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Vec<u8> {
280        let mut ctx = EvalCtx {
281            output: Vec::with_capacity(self.capacity),
282            rng,
283        };
284        ctx.eval(&self.compiled);
285        ctx.output
286    }
287}
288
289impl Distribution<String> for Regex {
290    /// Samples a random string satisfying the regex.
291    ///
292    /// # Panics
293    ///
294    /// If the regex produced some non-UTF-8 byte sequence, this method will
295    /// panic. You may want to check [`is_utf8()`](Regex::is_utf8) to ensure the
296    /// regex will only generate valid Unicode strings, or sample a
297    /// `Result<String, FromUtf8Error>` and manually handle the error.
298    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> String {
299        <Self as Distribution<Result<_, _>>>::sample(self, rng).unwrap()
300    }
301}
302
303impl Distribution<Result<String, FromUtf8Error>> for Regex {
304    /// Samples a random string satisfying the regex.
305    ///
306    /// The the sampled bytes sequence is not valid UTF-8, the sampling result
307    /// is an Err value.
308    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Result<String, FromUtf8Error> {
309        let bytes = <Self as Distribution<Vec<u8>>>::sample(self, rng);
310        if self.is_utf8() {
311            unsafe { Ok(String::from_utf8_unchecked(bytes)) }
312        } else {
313            String::from_utf8(bytes)
314        }
315    }
316}
317
318impl Distribution<EncodedString> for Regex {
319    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> EncodedString {
320        let result = <Self as Distribution<Result<_, _>>>::sample(self, rng);
321        EncodedString(match result {
322            Err(e) => Es::Binary(e),
323            Ok(s) => {
324                if self.is_ascii() || s.is_ascii() {
325                    Es::Ascii(s)
326                } else {
327                    Es::Utf8(s)
328                }
329            }
330        })
331    }
332}
333
334impl Default for Regex {
335    /// Creates an empty regex which generates empty strings.
336    ///
337    /// # Examples
338    ///
339    /// ```
340    /// use rand::Rng;
341    ///
342    /// let gen = rand_regex::Regex::default();
343    /// assert_eq!(rand::thread_rng().sample::<String, _>(&gen), "");
344    /// ```
345    #[inline]
346    fn default() -> Self {
347        Self {
348            compiled: Compiled::default(),
349            capacity: 0,
350            encoding: Encoding::Ascii,
351        }
352    }
353}
354
355impl Regex {
356    /// Obtains the narrowest string encoding this regex can produce.
357    pub const fn encoding(&self) -> Encoding {
358        self.encoding
359    }
360
361    /// Checks if the regex can only produce ASCII strings.
362    ///
363    /// # Examples
364    ///
365    /// ```
366    /// let ascii_gen = rand_regex::Regex::compile("[0-9]+", 100).unwrap();
367    /// assert_eq!(ascii_gen.is_ascii(), true);
368    /// let non_ascii_gen = rand_regex::Regex::compile(r"\d+", 100).unwrap();
369    /// assert_eq!(non_ascii_gen.is_ascii(), false);
370    /// ```
371    #[inline]
372    pub const fn is_ascii(&self) -> bool {
373        // FIXME remove the `as u8` once `PartialOrd` can be used in `const fn`.
374        (self.encoding as u8) == (Encoding::Ascii as u8)
375    }
376
377    /// Checks if the regex can only produce valid Unicode strings.
378    ///
379    /// Due to complexity of regex pattern, this method may have false
380    /// negative (returning false but still always produce valid UTF-8)
381    ///
382    /// # Examples
383    ///
384    /// ```
385    /// let utf8_hir = regex_syntax::ParserBuilder::new()
386    ///     .unicode(false)
387    ///     .utf8(false)
388    ///     .build()
389    ///     .parse(r"[\x00-\x7f]")
390    ///     .unwrap();
391    /// let utf8_gen = rand_regex::Regex::with_hir(utf8_hir, 100).unwrap();
392    /// assert_eq!(utf8_gen.is_utf8(), true);
393    ///
394    /// let non_utf8_hir = regex_syntax::ParserBuilder::new()
395    ///     .unicode(false)
396    ///     .utf8(false)
397    ///     .build()
398    ///     .parse(r"[\x00-\xff]")
399    ///     .unwrap();
400    /// let non_utf8_gen = rand_regex::Regex::with_hir(non_utf8_hir, 100).unwrap();
401    /// assert_eq!(non_utf8_gen.is_utf8(), false);
402    /// ```
403    #[inline]
404    pub const fn is_utf8(&self) -> bool {
405        // FIXME remove the `as u8` once `PartialOrd` can be used in `const fn`.
406        (self.encoding as u8) <= (Encoding::Utf8 as u8)
407    }
408
409    /// Returns the maximum length the string this regex can generate.
410    ///
411    /// # Examples
412    ///
413    /// ```
414    /// # #[cfg(feature = "unicode")] {
415    /// let gen = rand_regex::Regex::compile(r"\d{4}-\d{2}-\d{2}", 100).unwrap();
416    /// assert_eq!(gen.capacity(), 34);
417    /// // each `\d` can occupy 4 bytes
418    /// # }
419    /// ```
420    #[inline]
421    pub const fn capacity(&self) -> usize {
422        self.capacity
423    }
424
425    /// Compiles a regex pattern for string generation.
426    ///
427    /// If you need to supply additional flags to the pattern, please use
428    /// [`Regex::with_hir()`] instead.
429    ///
430    /// The `max_repeat` parameter gives the maximum extra repeat counts
431    /// the `x*`, `x+` and `x{n,}` operators will become, e.g.
432    ///
433    /// ```
434    /// let gen = rand_regex::Regex::compile("a{4,}", 100).unwrap();
435    /// // this will generate a string between 4 to 104 characters long.
436    /// assert_eq!(gen.capacity(), 104);
437    /// ```
438    ///
439    /// # Errors
440    ///
441    /// Returns an error if the pattern is not valid regex, or contains anchors
442    /// (`^`, `$`, `\A`, `\z`) or word boundary assertions (`\b`, `\B`).
443    pub fn compile(pattern: &str, max_repeat: u32) -> Result<Self, Error> {
444        let hir = Parser::new().parse(pattern)?;
445        Self::with_hir(hir, max_repeat)
446    }
447
448    /// Compiles a parsed regex pattern for string generation.
449    ///
450    /// The [`Hir`] object can be obtained using [`regex_syntax::ParserBuilder`].
451    ///
452    /// The `max_repeat` parameter gives the maximum extra repeat counts
453    /// the `x*`, `x+` and `x{n,}` operators will become.
454    ///
455    /// # Errors
456    ///
457    /// Returns an error if the `Hir` object contains anchors (`^`, `$`, `\A`,
458    /// `\z`) or word boundary assertions (`\b`, `\B`).
459    pub fn with_hir(hir: Hir, max_repeat: u32) -> Result<Self, Error> {
460        match hir.into_kind() {
461            HirKind::Empty => Ok(Self::default()),
462            HirKind::Look(_) => Err(Error::Anchor),
463            HirKind::Capture(hir::Capture { sub, .. }) => Self::with_hir(*sub, max_repeat),
464
465            HirKind::Literal(hir::Literal(bytes)) => Ok(Self::with_bytes_literal(bytes.into())),
466            HirKind::Class(hir::Class::Unicode(class)) => Self::with_unicode_class(&class),
467            HirKind::Class(hir::Class::Bytes(class)) => Self::with_byte_class(&class),
468            HirKind::Repetition(rep) => Self::with_repetition(rep, max_repeat),
469            HirKind::Concat(hirs) => Self::with_sequence(hirs, max_repeat),
470            HirKind::Alternation(hirs) => Self::with_choices(hirs, max_repeat),
471        }
472    }
473
474    fn with_bytes_literal(bytes: Vec<u8>) -> Self {
475        let es = EncodedString::from(bytes);
476        let encoding = es.encoding();
477        let bytes = Vec::from(es);
478        Self {
479            capacity: bytes.len(),
480            compiled: Kind::Literal(bytes).into(),
481            encoding,
482        }
483    }
484
485    fn with_unicode_class(class: &ClassUnicode) -> Result<Self, Error> {
486        Ok(if let Some(byte_class) = class.to_byte_class() {
487            Self::with_byte_class(&byte_class)?
488        } else {
489            Self {
490                compiled: compile_unicode_class(class.ranges())?.into(),
491                capacity: class.maximum_len().unwrap_or(0),
492                encoding: Encoding::Utf8,
493            }
494        })
495    }
496
497    fn with_byte_class(class: &ClassBytes) -> Result<Self, Error> {
498        Ok(Self {
499            compiled: Kind::ByteClass(ByteClass::compile(class.ranges())?).into(),
500            capacity: 1,
501            encoding: if class.is_ascii() {
502                Encoding::Ascii
503            } else {
504                Encoding::Binary
505            },
506        })
507    }
508
509    fn with_repetition(rep: Repetition, max_repeat: u32) -> Result<Self, Error> {
510        let lower = rep.min;
511        let upper = rep.max.unwrap_or(lower + max_repeat);
512
513        // simplification: `(<any>){0}` is always empty.
514        if upper == 0 {
515            return Ok(Self::default());
516        }
517
518        let mut regex = Self::with_hir(*rep.sub, max_repeat)?;
519        regex.capacity *= upper as usize;
520        if lower == upper {
521            regex.compiled.repeat_const *= upper;
522        } else {
523            regex
524                .compiled
525                .repeat_ranges
526                .push(Uniform::new_inclusive(lower, upper).map_err(|_| Error::Unsatisfiable)?);
527        }
528
529        // simplification: if the inner is an literal, replace `x{3}` by `xxx`.
530        if let Kind::Literal(lit) = &mut regex.compiled.kind {
531            if regex.compiled.repeat_const > 1 {
532                *lit = lit.repeat(regex.compiled.repeat_const as usize);
533                regex.compiled.repeat_const = 1;
534            }
535        }
536
537        Ok(regex)
538    }
539
540    fn with_sequence(hirs: Vec<Hir>, max_repeat: u32) -> Result<Self, Error> {
541        let mut seq = Vec::with_capacity(hirs.len());
542        let mut capacity = 0;
543        let mut encoding = Encoding::Ascii;
544
545        for hir in hirs {
546            let regex = Self::with_hir(hir, max_repeat)?;
547            capacity += regex.capacity;
548            encoding = encoding.max(regex.encoding);
549            let compiled = regex.compiled;
550            if compiled.is_single() {
551                // simplification: `x(yz)` = `xyz`
552                if let Kind::Sequence(mut s) = compiled.kind {
553                    seq.append(&mut s);
554                    continue;
555                }
556            }
557            seq.push(compiled);
558        }
559
560        // Further simplify by merging adjacent literals.
561        let mut simplified = Vec::with_capacity(seq.len());
562        let mut combined_lit = Vec::new();
563        for cur in seq {
564            if cur.is_single() {
565                if let Kind::Literal(mut lit) = cur.kind {
566                    combined_lit.append(&mut lit);
567                    continue;
568                }
569            }
570            if !combined_lit.is_empty() {
571                simplified.push(Kind::Literal(mem::take(&mut combined_lit)).into());
572            }
573            simplified.push(cur);
574        }
575
576        if !combined_lit.is_empty() {
577            simplified.push(Kind::Literal(combined_lit).into());
578        }
579
580        let compiled = match simplified.len() {
581            0 => return Ok(Self::default()),
582            1 => simplified.swap_remove(0),
583            _ => Kind::Sequence(simplified).into(),
584        };
585
586        Ok(Self {
587            compiled,
588            capacity,
589            encoding,
590        })
591    }
592
593    fn with_choices(hirs: Vec<Hir>, max_repeat: u32) -> Result<Self, Error> {
594        let mut choices = Vec::with_capacity(hirs.len());
595        let mut capacity = 0;
596        let mut encoding = Encoding::Ascii;
597        for hir in hirs {
598            let regex = Self::with_hir(hir, max_repeat)?;
599            if regex.capacity > capacity {
600                capacity = regex.capacity;
601            }
602            encoding = encoding.max(regex.encoding);
603
604            let compiled = regex.compiled;
605            if compiled.is_single() {
606                if let Kind::Any {
607                    choices: mut sc, ..
608                } = compiled.kind
609                {
610                    choices.append(&mut sc);
611                    continue;
612                }
613            }
614            choices.push(compiled);
615        }
616        Ok(Self {
617            compiled: Kind::Any {
618                index: Uniform::new(0, choices.len()).map_err(|_| Error::Unsatisfiable)?,
619                choices,
620            }
621            .into(),
622            capacity,
623            encoding,
624        })
625    }
626}
627
628/// Represents a compiled regex component.
629#[derive(Clone, Debug)]
630struct Compiled {
631    // Constant part of repetition.
632    repeat_const: u32,
633    // Variable parts of repetition. The repeats are multiplied together.
634    repeat_ranges: Vec<Uniform<u32>>,
635    // Kind of atomic regex component.
636    kind: Kind,
637}
638
639impl Default for Compiled {
640    fn default() -> Self {
641        Kind::default().into()
642    }
643}
644
645impl Compiled {
646    /// Returns whether this component has no repetition.
647    fn is_single(&self) -> bool {
648        self.repeat_const == 1 && self.repeat_ranges.is_empty()
649    }
650}
651
652#[derive(Clone, Debug)]
653enum Kind {
654    Literal(Vec<u8>),
655    Sequence(Vec<Compiled>),
656    Any {
657        index: Uniform<usize>,
658        choices: Vec<Compiled>,
659    },
660    LongUnicodeClass(LongUnicodeClass),
661    ShortUnicodeClass(ShortUnicodeClass),
662    ByteClass(ByteClass),
663}
664
665impl Default for Kind {
666    fn default() -> Self {
667        Self::Literal(Vec::new())
668    }
669}
670
671impl From<Kind> for Compiled {
672    fn from(kind: Kind) -> Self {
673        Self {
674            repeat_const: 1,
675            repeat_ranges: Vec::new(),
676            kind,
677        }
678    }
679}
680
681struct EvalCtx<'a, R: ?Sized + 'a> {
682    output: Vec<u8>,
683    rng: &'a mut R,
684}
685
686impl<'a, R: Rng + ?Sized + 'a> EvalCtx<'a, R> {
687    fn eval(&mut self, compiled: &Compiled) {
688        let count = compiled
689            .repeat_ranges
690            .iter()
691            .fold(compiled.repeat_const, |c, u| c * u.sample(self.rng));
692
693        match &compiled.kind {
694            Kind::Literal(lit) => self.eval_literal(count, lit),
695            Kind::Sequence(seq) => self.eval_sequence(count, seq),
696            Kind::Any { index, choices } => self.eval_alt(count, index, choices),
697            Kind::LongUnicodeClass(class) => self.eval_unicode_class(count, class),
698            Kind::ShortUnicodeClass(class) => self.eval_unicode_class(count, class),
699            Kind::ByteClass(class) => self.eval_byte_class(count, class),
700        }
701    }
702
703    fn eval_literal(&mut self, count: u32, lit: &[u8]) {
704        for _ in 0..count {
705            self.output.extend_from_slice(lit);
706        }
707    }
708
709    fn eval_sequence(&mut self, count: u32, seq: &[Compiled]) {
710        for _ in 0..count {
711            for compiled in seq {
712                self.eval(compiled);
713            }
714        }
715    }
716
717    fn eval_alt(&mut self, count: u32, index: &Uniform<usize>, choices: &[Compiled]) {
718        for _ in 0..count {
719            let idx = index.sample(self.rng);
720            self.eval(&choices[idx]);
721        }
722    }
723
724    fn eval_unicode_class(&mut self, count: u32, class: &impl Distribution<char>) {
725        let mut buf = [0; 4];
726        for c in class.sample_iter(&mut self.rng).take(count as usize) {
727            let bytes = c.encode_utf8(&mut buf).as_bytes();
728            self.output.extend_from_slice(bytes);
729        }
730    }
731
732    fn eval_byte_class(&mut self, count: u32, class: &ByteClass) {
733        self.output
734            .extend(self.rng.sample_iter(class).take(count as usize));
735    }
736}
737
738/// A compiled Unicode class of more than 64 ranges.
739#[derive(Clone, Debug)]
740struct LongUnicodeClass {
741    searcher: Uniform<u32>,
742    ranges: Box<[(u32, u32)]>,
743}
744
745impl Distribution<char> for LongUnicodeClass {
746    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
747        let normalized_index = self.searcher.sample(rng);
748        let entry_index = self
749            .ranges
750            .binary_search_by(|(normalized_start, _)| normalized_start.cmp(&normalized_index))
751            .unwrap_or_else(|e| e - 1);
752        let code = normalized_index + self.ranges[entry_index].1;
753        char::from_u32(code).expect("valid char")
754    }
755}
756
757/// A compiled Unicode class of less than or equals to 64 ranges.
758#[derive(Clone, Debug)]
759struct ShortUnicodeClass {
760    index: Uniform<usize>,
761    cases: Box<[char]>,
762}
763
764impl Distribution<char> for ShortUnicodeClass {
765    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
766        self.cases[self.index.sample(rng)]
767    }
768}
769
770fn compile_unicode_class_with(ranges: &[hir::ClassUnicodeRange], mut push: impl FnMut(char, char)) {
771    for range in ranges {
772        let start = range.start();
773        let end = range.end();
774        if start <= '\u{d7ff}' && '\u{e000}' <= end {
775            push(start, '\u{d7ff}');
776            push('\u{e000}', end);
777        } else {
778            push(start, end);
779        }
780    }
781}
782
783fn compile_unicode_class(ranges: &[hir::ClassUnicodeRange]) -> Result<Kind, Error> {
784    let mut normalized_ranges = Vec::new();
785    let mut normalized_len = 0;
786    compile_unicode_class_with(ranges, |start, end| {
787        let start = u32::from(start);
788        let end = u32::from(end);
789        normalized_ranges.push((normalized_len, start - normalized_len));
790        normalized_len += end - start + 1;
791    });
792
793    if normalized_len as usize > SHORT_UNICODE_CLASS_COUNT {
794        return Ok(Kind::LongUnicodeClass(LongUnicodeClass {
795            searcher: Uniform::new(0, normalized_len).map_err(|_| Error::Unsatisfiable)?,
796            ranges: normalized_ranges.into_boxed_slice(),
797        }));
798    }
799
800    // the number of cases is too small. convert into a direct search array.
801    let mut cases = Vec::with_capacity(normalized_len as usize);
802    compile_unicode_class_with(ranges, |start, end| {
803        for c in u32::from(start)..=u32::from(end) {
804            cases.push(char::from_u32(c).expect("valid char"));
805        }
806    });
807
808    Ok(Kind::ShortUnicodeClass(ShortUnicodeClass {
809        index: Uniform::new(0, cases.len()).map_err(|_| Error::Unsatisfiable)?,
810        cases: cases.into_boxed_slice(),
811    }))
812}
813
814/// A compiled byte class.
815#[derive(Clone, Debug)]
816struct ByteClass {
817    index: Uniform<usize>,
818    cases: Box<[u8]>,
819}
820
821impl ByteClass {
822    fn compile(ranges: &[hir::ClassBytesRange]) -> Result<Self, Error> {
823        let mut cases = Vec::with_capacity(256);
824        for range in ranges {
825            cases.extend(range.start()..=range.end());
826        }
827        Ok(Self {
828            index: Uniform::new(0, cases.len()).map_err(|_| Error::Unsatisfiable)?,
829            cases: cases.into_boxed_slice(),
830        })
831    }
832}
833
834impl Distribution<u8> for ByteClass {
835    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> u8 {
836        self.cases[self.index.sample(rng)]
837    }
838}
839
840#[cfg(test)]
841mod test {
842    use super::*;
843    use rand::rng as thread_rng;
844    use std::collections::HashSet;
845    use std::ops::RangeInclusive;
846
847    fn check_str(
848        pattern: &str,
849        encoding: Encoding,
850        distinct_count: RangeInclusive<usize>,
851        run_count: usize,
852    ) {
853        let r = regex::Regex::new(pattern).unwrap();
854        let gen = Regex::compile(pattern, 100).unwrap();
855        assert!(gen.is_utf8());
856        assert_eq!(gen.encoding(), encoding);
857
858        let mut rng = thread_rng();
859
860        let mut gen_set = HashSet::<String>::with_capacity(run_count.min(*distinct_count.end()));
861        for res in (&gen).sample_iter(&mut rng).take(run_count) {
862            let res: String = res;
863            assert!(res.len() <= gen.capacity());
864            assert!(
865                r.is_match(&res),
866                "Wrong sample for pattern `{}`: `{}`",
867                pattern,
868                res
869            );
870            gen_set.insert(res);
871        }
872        let gen_count = gen_set.len();
873        assert!(
874            *distinct_count.start() <= gen_count && gen_count <= *distinct_count.end(),
875            "Distinct samples generated for pattern `{}` outside the range {:?}: {} (examples:\n{})",
876            pattern,
877            distinct_count,
878            gen_count,
879            gen_set.iter().take(10).map(|s| format!(" - {:#?}\n", s)).collect::<String>(),
880        );
881    }
882
883    fn run_count_for_distinct_count(distinct_count: usize) -> usize {
884        // Suppose a regex can possibly generate N distinct strings uniformly. What is the
885        // probability distribution of number of distinct strings R we can get by running the
886        // generator M times?
887        //
888        // Assuming we can afford M ≫ N ≈ R, we could find out the probability which (N - R) strings
889        // are still *not* generated after M iterations, which is P = (1 - (R/N)^M)^(Binomial[N,R])
890        // ≈ 1 - Binomial[N,R] * (R/N)^M.
891        //
892        // Here we choose the lower bound as R+1 after solving M for P > 0.999999, or:
893        //
894        //  Binomial[N,R] * (R/N)^M < 10^(-6)
895        //
896        // We limit M ≤ 4096 to keep the test time short.
897
898        if distinct_count <= 1 {
899            return 8;
900        }
901        let n = distinct_count as f64;
902        ((n.ln() + 6.0 * std::f64::consts::LN_10) / (n.ln() - (n - 1.0).ln())).ceil() as usize
903    }
904
905    #[test]
906    fn sanity_test_run_count() {
907        assert_eq!(run_count_for_distinct_count(1), 8);
908        assert_eq!(run_count_for_distinct_count(2), 21);
909        assert_eq!(run_count_for_distinct_count(3), 37);
910        assert_eq!(run_count_for_distinct_count(10), 153);
911        assert_eq!(run_count_for_distinct_count(26), 436);
912        assert_eq!(run_count_for_distinct_count(62), 1104);
913        assert_eq!(run_count_for_distinct_count(128), 2381);
914        assert_eq!(run_count_for_distinct_count(214), 4096);
915    }
916
917    fn check_str_limited(pattern: &str, encoding: Encoding, distinct_count: usize) {
918        let run_count = run_count_for_distinct_count(distinct_count);
919        check_str(
920            pattern,
921            encoding,
922            distinct_count..=distinct_count,
923            run_count,
924        );
925    }
926
927    fn check_str_unlimited(pattern: &str, encoding: Encoding, min_distinct_count: usize) {
928        check_str(pattern, encoding, min_distinct_count..=4096, 4096);
929    }
930
931    #[test]
932    fn test_proptest() {
933        check_str_limited("foo", Encoding::Ascii, 1);
934        check_str_limited("foo|bar|baz", Encoding::Ascii, 3);
935        check_str_limited("a{0,8}", Encoding::Ascii, 9);
936        check_str_limited("a?", Encoding::Ascii, 2);
937        check_str_limited("a*", Encoding::Ascii, 101);
938        check_str_limited("a+", Encoding::Ascii, 101);
939        check_str_limited("a{4,}", Encoding::Ascii, 101);
940        check_str_limited("(foo|bar)(xyzzy|plugh)", Encoding::Ascii, 4);
941        check_str_unlimited(".", Encoding::Utf8, 4075);
942        check_str_unlimited("(?s).", Encoding::Utf8, 4075);
943    }
944
945    #[test]
946    fn test_regex_generate() {
947        check_str_limited("", Encoding::Ascii, 1);
948        check_str_limited("aBcDe", Encoding::Ascii, 1);
949        check_str_limited("[a-zA-Z0-9]", Encoding::Ascii, 62);
950        check_str_limited("a{3,8}", Encoding::Ascii, 6);
951        check_str_limited("a{3}", Encoding::Ascii, 1);
952        check_str_limited("a{3}-a{3}", Encoding::Ascii, 1);
953        check_str_limited("(abcde)", Encoding::Ascii, 1);
954        check_str_limited("a?b?", Encoding::Ascii, 4);
955    }
956
957    #[test]
958    #[cfg(feature = "unicode")]
959    fn test_unicode_cases() {
960        check_str_limited("(?i:fOo)", Encoding::Ascii, 8);
961        check_str_limited("(?i:a|B)", Encoding::Ascii, 4);
962        check_str_unlimited(r"(\p{Greek}\P{Greek})(?:\d{3,6})", Encoding::Utf8, 4096);
963    }
964
965    #[test]
966    fn test_ascii_character_classes() {
967        check_str_limited("[[:alnum:]]", Encoding::Ascii, 62);
968        check_str_limited("[[:alpha:]]", Encoding::Ascii, 52);
969        check_str_limited("[[:ascii:]]", Encoding::Ascii, 128);
970        check_str_limited("[[:blank:]]", Encoding::Ascii, 2);
971        check_str_limited("[[:cntrl:]]", Encoding::Ascii, 33);
972        check_str_limited("[[:digit:]]", Encoding::Ascii, 10);
973        check_str_limited("[[:graph:]]", Encoding::Ascii, 94);
974        check_str_limited("[[:lower:]]", Encoding::Ascii, 26);
975        check_str_limited("[[:print:]]", Encoding::Ascii, 95);
976        check_str_limited("[[:punct:]]", Encoding::Ascii, 32);
977        check_str_limited("[[:space:]]", Encoding::Ascii, 6);
978        check_str_limited("[[:upper:]]", Encoding::Ascii, 26);
979        check_str_limited("[[:word:]]", Encoding::Ascii, 63);
980        check_str_limited("[[:xdigit:]]", Encoding::Ascii, 22);
981    }
982
983    #[test]
984    #[cfg(feature = "unicode")]
985    fn sanity_test_unicode_character_classes_size() {
986        // This test records the number of characters in each unicode class.
987        // If any of these test failed, please:
988        //  1. update the RHS of the numbers
989        //  2. increase the revision number of the regex-syntax dependency
990        //  3. update the relevant ranges in the test_unicode_* functions.
991        //
992        // (for easy reference, there are 1_112_064 assignable code points)
993
994        fn count_class_chars(pattern: &str) -> usize {
995            use regex_syntax::{
996                hir::{Class, HirKind},
997                parse,
998            };
999
1000            let hir = parse(pattern).unwrap();
1001            let HirKind::Class(Class::Unicode(cls)) = hir.into_kind() else {
1002                unreachable!()
1003            };
1004            // we assume all positive unicode classes do not cover the surrogate range.
1005            // otherwise `r.len()` is wrong.
1006            cls.iter().map(|r| r.len()).sum()
1007        }
1008
1009        assert_eq!(count_class_chars(r"\p{L}"), 141_028);
1010        assert_eq!(count_class_chars(r"\p{M}"), 2_501);
1011        assert_eq!(count_class_chars(r"\p{N}"), 1_911);
1012        assert_eq!(count_class_chars(r"\p{P}"), 855);
1013        assert_eq!(count_class_chars(r"\p{S}"), 8_514);
1014        assert_eq!(count_class_chars(r"\p{Z}"), 19);
1015        assert_eq!(count_class_chars(r"\p{C}"), 959_284);
1016
1017        assert_eq!(count_class_chars(r"\p{Latin}"), 1_487);
1018        assert_eq!(count_class_chars(r"\p{Greek}"), 518);
1019        assert_eq!(count_class_chars(r"\p{Cyrillic}"), 508);
1020        assert_eq!(count_class_chars(r"\p{Armenian}"), 96);
1021        assert_eq!(count_class_chars(r"\p{Hebrew}"), 134);
1022        assert_eq!(count_class_chars(r"\p{Arabic}"), 1_373);
1023        assert_eq!(count_class_chars(r"\p{Syriac}"), 88);
1024        assert_eq!(count_class_chars(r"\p{Thaana}"), 50);
1025        assert_eq!(count_class_chars(r"\p{Devanagari}"), 164);
1026        assert_eq!(count_class_chars(r"\p{Bengali}"), 96);
1027        assert_eq!(count_class_chars(r"\p{Gurmukhi}"), 80);
1028        assert_eq!(count_class_chars(r"\p{Gujarati}"), 91);
1029        assert_eq!(count_class_chars(r"\p{Oriya}"), 91);
1030        assert_eq!(count_class_chars(r"\p{Tamil}"), 123);
1031        assert_eq!(count_class_chars(r"\p{Hangul}"), 11_739);
1032        assert_eq!(count_class_chars(r"\p{Hiragana}"), 381);
1033        assert_eq!(count_class_chars(r"\p{Katakana}"), 321);
1034        assert_eq!(count_class_chars(r"\p{Han}"), 99_030);
1035        assert_eq!(count_class_chars(r"\p{Tagalog}"), 23);
1036        assert_eq!(count_class_chars(r"\p{Linear_B}"), 211);
1037        assert_eq!(count_class_chars(r"\p{Inherited}"), 657);
1038
1039        assert_eq!(count_class_chars(r"\d"), 760);
1040        assert_eq!(count_class_chars(r"\s"), 25);
1041        assert_eq!(count_class_chars(r"\w"), 144_667);
1042    }
1043
1044    #[test]
1045    #[cfg(feature = "unicode")]
1046    fn test_unicode_character_classes() {
1047        // The range describes the number of distinct strings we can get from the regex.
1048        //
1049        // Suppose the class has M members. If we randomly pick N items out of it with duplicates,
1050        // the chance that there are K distinct members is the classical occupancy distibution[1]:
1051        //
1052        //      Occ(K|N,M) = (S2(N,K) * M!) / ((M-K)! * M^N)
1053        //
1054        // where S2(N,K) are the Stirling numbers of the second kind.
1055        //
1056        // This distribution has mean and variance of
1057        //
1058        //      μ  = M * (1 - (1 - 1/M)^N)
1059        //      σ² = M * ((M(M-1))^N + (M-1)(M(M-2))^N - M(M-1)^(2N)) / M^(2N)
1060        //
1061        // which we can use to approximate as a normal distrubition and calculate the CDF to find
1062        // out the 0.0001% percentile as the lower bound of K.
1063        //
1064        // (The upper bound should always be M, the 100% percentile.)
1065        //
1066        // The Mathematica code to compute the lower bound of K is:
1067        //
1068        //      getInterval[m_, n_] := Block[{
1069        //          mean = m(1-(1-1/m)^n),
1070        //          var = m((m(m-1))^n+(m-1)(m(m-2))^n-m(m-1)^(2n))/m^(2n)
1071        //      }, InverseCDF[NormalDistribution[mean, Sqrt[var]], 1*^-6]
1072        //
1073        //      (* Usage: getInterval[2450, 4096.0`32] *)
1074        //
1075        // [1]: https://doi.org/10.1080/00031305.2019.1699445
1076
1077        check_str_unlimited(r"\p{L}", Encoding::Utf8, 3999);
1078        check_str(r"\p{M}", Encoding::Utf8, 1941..=2501, 4096);
1079        check_str(r"\p{N}", Encoding::Utf8, 1630..=1911, 4096);
1080        check_str(r"\p{P}", Encoding::Utf8, 835..=855, 4096);
1081        check_str_unlimited(r"\p{S}", Encoding::Utf8, 3151);
1082        check_str_limited(r"\p{Z}", Encoding::Utf8, 19);
1083        check_str_unlimited(r"\p{C}", Encoding::Utf8, 4073);
1084
1085        check_str_unlimited(r"\P{L}", Encoding::Utf8, 4073);
1086        check_str_unlimited(r"\P{M}", Encoding::Utf8, 4075);
1087        check_str_unlimited(r"\P{N}", Encoding::Utf8, 4075);
1088        check_str_unlimited(r"\P{P}", Encoding::Utf8, 4075);
1089        check_str_unlimited(r"\P{S}", Encoding::Utf8, 4075);
1090        check_str_unlimited(r"\P{Z}", Encoding::Utf8, 4075);
1091        check_str_unlimited(r"\P{C}", Encoding::Utf8, 4007);
1092    }
1093
1094    #[test]
1095    #[cfg(feature = "unicode")]
1096    fn test_unicode_script_classes() {
1097        check_str(r"\p{Latin}", Encoding::Utf8, 1352..=1487, 4096);
1098        check_str(r"\p{Greek}", Encoding::Utf8, 516..=518, 4096);
1099        check_str(r"\p{Cyrillic}", Encoding::Utf8, 506..=508, 4096);
1100        check_str_limited(r"\p{Armenian}", Encoding::Utf8, 96);
1101        check_str_limited(r"\p{Hebrew}", Encoding::Utf8, 134);
1102        check_str(r"\p{Arabic}", Encoding::Utf8, 1268..=1373, 4096);
1103        check_str_limited(r"\p{Syriac}", Encoding::Utf8, 88);
1104        check_str_limited(r"\p{Thaana}", Encoding::Utf8, 50);
1105        check_str_limited(r"\p{Devanagari}", Encoding::Utf8, 164);
1106        check_str_limited(r"\p{Bengali}", Encoding::Utf8, 96);
1107        check_str_limited(r"\p{Gurmukhi}", Encoding::Utf8, 80);
1108        check_str_limited(r"\p{Gujarati}", Encoding::Utf8, 91);
1109        check_str_limited(r"\p{Oriya}", Encoding::Utf8, 91);
1110        check_str_limited(r"\p{Tamil}", Encoding::Utf8, 123);
1111        check_str_unlimited(r"\p{Hangul}", Encoding::Utf8, 3363);
1112        check_str_limited(r"\p{Hiragana}", Encoding::Utf8, 381);
1113        check_str_limited(r"\p{Katakana}", Encoding::Utf8, 321);
1114        check_str_unlimited(r"\p{Han}", Encoding::Utf8, 3970);
1115        check_str_limited(r"\p{Tagalog}", Encoding::Utf8, 23);
1116        check_str_limited(r"\p{Linear_B}", Encoding::Utf8, 211);
1117        check_str(r"\p{Inherited}", Encoding::Utf8, 650..=657, 4096);
1118    }
1119
1120    #[test]
1121    #[cfg(feature = "unicode")]
1122    fn test_perl_classes() {
1123        check_str_unlimited(r"\d+", Encoding::Utf8, 4061);
1124        check_str_unlimited(r"\D+", Encoding::Utf8, 4096);
1125        check_str_unlimited(r"\s+", Encoding::Utf8, 4014);
1126        check_str_unlimited(r"\S+", Encoding::Utf8, 4096);
1127        check_str_unlimited(r"\w+", Encoding::Utf8, 4095);
1128        check_str_unlimited(r"\W+", Encoding::Utf8, 4096);
1129    }
1130
1131    #[cfg(any())]
1132    fn dump_categories() {
1133        use regex_syntax::hir::*;
1134
1135        let categories = &[r"\p{Nd}", r"\p{Greek}"];
1136
1137        for cat in categories {
1138            if let HirKind::Class(Class::Unicode(cls)) =
1139                regex_syntax::Parser::new().parse(cat).unwrap().into_kind()
1140            {
1141                let s: u32 = cls
1142                    .iter()
1143                    .map(|r| u32::from(r.end()) - u32::from(r.start()) + 1)
1144                    .sum();
1145                println!("{} => {}", cat, s);
1146            }
1147        }
1148    }
1149
1150    #[test]
1151    fn test_binary_generator() {
1152        const PATTERN: &str = r"PE\x00\x00.{20}";
1153
1154        let r = regex::bytes::RegexBuilder::new(PATTERN)
1155            .unicode(false)
1156            .dot_matches_new_line(true)
1157            .build()
1158            .unwrap();
1159
1160        let hir = regex_syntax::ParserBuilder::new()
1161            .unicode(false)
1162            .dot_matches_new_line(true)
1163            .utf8(false)
1164            .build()
1165            .parse(PATTERN)
1166            .unwrap();
1167
1168        let gen = Regex::with_hir(hir, 100).unwrap();
1169        assert_eq!(gen.capacity(), 24);
1170        assert!(!gen.is_utf8());
1171        assert_eq!(gen.encoding(), Encoding::Binary);
1172
1173        let mut rng = thread_rng();
1174        for res in gen.sample_iter(&mut rng).take(8192) {
1175            let res: Vec<u8> = res;
1176            assert!(r.is_match(&res), "Wrong sample: {:?}, `{:?}`", r, res);
1177        }
1178    }
1179
1180    #[test]
1181    fn test_encoding_generator_1() {
1182        let hir = regex_syntax::ParserBuilder::new()
1183            .unicode(false)
1184            .utf8(false)
1185            .build()
1186            .parse(r"[\x00-\xff]{2}")
1187            .unwrap();
1188        let gen = Regex::with_hir(hir, 100).unwrap();
1189
1190        // This pattern will produce:
1191        //  - 16384 ASCII patterns (128^2)
1192        //  -  1920 UTF-8 patterns (30 * 64)
1193        //  - 47232 binary patterns (256^2 - 16384 - 1920)
1194
1195        let mut encoding_counts = [0; 3];
1196        let mut rng = thread_rng();
1197        for encoded_string in gen.sample_iter(&mut rng).take(8192) {
1198            let encoded_string: EncodedString = encoded_string;
1199            let bytes = encoded_string.as_bytes();
1200            let encoding = encoded_string.encoding();
1201            assert_eq!(bytes.len(), 2);
1202            if bytes.is_ascii() {
1203                assert_eq!(encoding, Encoding::Ascii);
1204            } else if std::str::from_utf8(bytes).is_ok() {
1205                assert_eq!(encoding, Encoding::Utf8);
1206            } else {
1207                assert_eq!(encoding, Encoding::Binary);
1208            }
1209            encoding_counts[encoding as usize] += 1;
1210        }
1211
1212        // the following ranges are 99.9999% confidence intervals of the multinomial distribution.
1213        assert!((1858..2243).contains(&encoding_counts[Encoding::Ascii as usize]));
1214        assert!((169..319).contains(&encoding_counts[Encoding::Utf8 as usize]));
1215        assert!((5704..6102).contains(&encoding_counts[Encoding::Binary as usize]));
1216    }
1217
1218    #[test]
1219    fn test_encoding_generator_2() {
1220        let gen = Regex::compile(r"[\u{0}-\u{b5}]{2}", 100).unwrap();
1221
1222        // This pattern will produce 32761 distinct outputs, with:
1223        //  - 16384 ASCII patterns
1224        //  - 16377 UTF-8 patterns
1225
1226        let mut encoding_counts = [0; 2];
1227        let mut rng = thread_rng();
1228        for encoded_string in gen.sample_iter(&mut rng).take(8192) {
1229            let encoded_string: EncodedString = encoded_string;
1230            let encoding = encoded_string.encoding();
1231            let string = encoded_string.as_str().unwrap();
1232            assert_eq!(string.chars().count(), 2);
1233            if string.is_ascii() {
1234                assert_eq!(encoding, Encoding::Ascii);
1235                assert_eq!(string.len(), 2);
1236            } else {
1237                assert_eq!(encoding, Encoding::Utf8);
1238            }
1239            encoding_counts[encoding as usize] += 1;
1240        }
1241
1242        // the following ranges are 99.9999% confidence intervals of the multinomial distribution.
1243        assert!((3876..4319).contains(&encoding_counts[Encoding::Ascii as usize]));
1244        assert!((3874..4317).contains(&encoding_counts[Encoding::Utf8 as usize]));
1245    }
1246
1247    #[test]
1248    fn test_encoding_generator_3() {
1249        let gen = Regex::compile(r"[\u{0}-\u{7f}]{2}", 100).unwrap();
1250        let mut rng = thread_rng();
1251        for encoded_string in gen.sample_iter(&mut rng).take(8192) {
1252            let encoded_string: EncodedString = encoded_string;
1253            assert_eq!(encoded_string.encoding(), Encoding::Ascii);
1254            assert_eq!(String::try_from(encoded_string).unwrap().len(), 2);
1255        }
1256    }
1257
1258    #[test]
1259    #[should_panic(expected = "FromUtf8Error")]
1260    fn test_generating_non_utf8_string() {
1261        let hir = regex_syntax::ParserBuilder::new()
1262            .unicode(false)
1263            .utf8(false)
1264            .build()
1265            .parse(r"\x88")
1266            .unwrap();
1267
1268        let gen = Regex::with_hir(hir, 100).unwrap();
1269        assert!(!gen.is_utf8());
1270        assert_eq!(gen.encoding(), Encoding::Binary);
1271
1272        let mut rng = thread_rng();
1273        let _: String = rng.sample(&gen);
1274    }
1275
1276    #[test]
1277    fn unsatisfiable_char_class_intersection() {
1278        assert!(matches!(
1279            Regex::compile("[a&&b]", 100),
1280            Err(Error::Unsatisfiable)
1281        ));
1282    }
1283}