1#![warn(missing_docs, clippy::pedantic)]
2
3#![allow(clippy::must_use_candidate)]
38
39use rand::{
40 distr::{uniform::Uniform, Distribution},
41 Rng,
42};
43use regex_syntax::{
44 hir::{self, ClassBytes, ClassUnicode, Hir, HirKind, Repetition},
45 Parser,
46};
47use std::{
48 char,
49 cmp::Ordering,
50 error,
51 fmt::{self, Debug},
52 hash::{Hash, Hasher},
53 mem,
54 str::Utf8Error,
55 string::FromUtf8Error,
56};
57
/// Largest number of code points a Unicode class may contain and still be
/// compiled into a `ShortUnicodeClass` (a flat table of `char`s). Classes with
/// more code points are compiled into a `LongUnicodeClass` instead.
const SHORT_UNICODE_CLASS_COUNT: usize = 64;
59
/// Error returned when a regex cannot be compiled into a generator.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Error {
    /// The pattern contains a look-around assertion (reported for every
    /// `HirKind::Look` node), which the generator does not support.
    Anchor,

    /// The pattern could not be parsed; wraps the error from `regex_syntax`.
    Syntax(Box<regex_syntax::Error>),

    /// Nothing can match the pattern (for example an empty character class),
    /// so no sample can be generated.
    Unsatisfiable,
}
118
119impl fmt::Display for Error {
120 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
121 match self {
122 Self::Unsatisfiable => f.write_str("regex is unsatisfiable"),
123 Self::Anchor => f.write_str("anchor is not supported"),
124 Self::Syntax(e) => fmt::Display::fmt(e, f),
125 }
126 }
127}
128
129impl error::Error for Error {
130 fn source(&self) -> Option<&(dyn error::Error + 'static)> {
131 match self {
132 Self::Unsatisfiable => None,
133 Self::Anchor => None,
134 Self::Syntax(e) => Some(e),
135 }
136 }
137}
138
139impl From<regex_syntax::Error> for Error {
140 fn from(e: regex_syntax::Error) -> Self {
141 Self::Syntax(Box::new(e))
142 }
143}
144
/// How the bytes of a generated sample can be interpreted.
///
/// The variants are ordered from most to least restrictive
/// (`Ascii < Utf8 < Binary`), so the maximum of several encodings describes
/// the encoding of their combination.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Encoding {
    /// Only ASCII bytes.
    Ascii = 0,
    /// Valid UTF-8.
    Utf8 = 1,
    /// Arbitrary bytes, not necessarily valid UTF-8.
    Binary = 2,
}
155
/// Internal storage behind [`EncodedString`], one variant per [`Encoding`].
#[derive(Debug)]
enum Es {
    /// A string reported as [`Encoding::Ascii`].
    Ascii(String),
    /// A string reported as [`Encoding::Utf8`].
    Utf8(String),
    /// Bytes that failed UTF-8 validation. The `FromUtf8Error` keeps both the
    /// original bytes and the decoding error.
    Binary(FromUtf8Error),
}
168
/// A sample together with the knowledge of how it is encoded (ASCII, UTF-8 or
/// arbitrary binary).
#[derive(Debug)]
pub struct EncodedString(Es);
172
173impl EncodedString {
174 pub fn as_bytes(&self) -> &[u8] {
176 match &self.0 {
177 Es::Ascii(s) | Es::Utf8(s) => s.as_bytes(),
178 Es::Binary(e) => e.as_bytes(),
179 }
180 }
181
182 pub fn as_str(&self) -> Result<&str, Utf8Error> {
189 match &self.0 {
190 Es::Ascii(s) | Es::Utf8(s) => Ok(s),
191 Es::Binary(e) => Err(e.utf8_error()),
192 }
193 }
194
195 pub fn encoding(&self) -> Encoding {
197 match self.0 {
198 Es::Ascii(_) => Encoding::Ascii,
199 Es::Utf8(_) => Encoding::Utf8,
200 Es::Binary(_) => Encoding::Binary,
201 }
202 }
203}
204
205impl From<EncodedString> for Vec<u8> {
206 fn from(es: EncodedString) -> Self {
207 match es.0 {
208 Es::Ascii(s) | Es::Utf8(s) => s.into_bytes(),
209 Es::Binary(e) => e.into_bytes(),
210 }
211 }
212}
213
214impl From<Vec<u8>> for EncodedString {
215 fn from(b: Vec<u8>) -> Self {
216 match String::from_utf8(b) {
217 Ok(s) => Self::from(s),
218 Err(e) => Self(Es::Binary(e)),
219 }
220 }
221}
222
223impl From<String> for EncodedString {
224 fn from(s: String) -> Self {
225 Self(if s.is_ascii() {
226 Es::Ascii(s)
227 } else {
228 Es::Utf8(s)
229 })
230 }
231}
232
233impl TryFrom<EncodedString> for String {
234 type Error = FromUtf8Error;
235 fn try_from(es: EncodedString) -> Result<Self, Self::Error> {
236 match es.0 {
237 Es::Ascii(s) | Es::Utf8(s) => Ok(s),
238 Es::Binary(e) => Err(e),
239 }
240 }
241}
242
243impl PartialEq for EncodedString {
244 fn eq(&self, other: &Self) -> bool {
245 self.as_bytes() == other.as_bytes()
246 }
247}
248
249impl Eq for EncodedString {}
250
251impl PartialOrd for EncodedString {
252 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
253 Some(self.cmp(other))
254 }
255}
256
257impl Ord for EncodedString {
258 fn cmp(&self, other: &Self) -> Ordering {
259 self.as_bytes().cmp(other.as_bytes())
260 }
261}
262
263impl Hash for EncodedString {
264 fn hash<H: Hasher>(&self, state: &mut H) {
265 self.as_bytes().hash(state);
266 }
267}
268
/// A compiled regex that can be used as a `rand` [`Distribution`] to generate
/// samples.
#[derive(Clone, Debug)]
pub struct Regex {
    /// The compiled program evaluated for every sample.
    compiled: Compiled,
    /// Length bound in bytes computed at compile time; used to pre-allocate
    /// the output buffer of each sample.
    capacity: usize,
    /// The widest encoding found among the parts of the regex.
    encoding: Encoding,
}
276
277impl Distribution<Vec<u8>> for Regex {
278 fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Vec<u8> {
280 let mut ctx = EvalCtx {
281 output: Vec::with_capacity(self.capacity),
282 rng,
283 };
284 ctx.eval(&self.compiled);
285 ctx.output
286 }
287}
288
289impl Distribution<String> for Regex {
290 fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> String {
299 <Self as Distribution<Result<_, _>>>::sample(self, rng).unwrap()
300 }
301}
302
303impl Distribution<Result<String, FromUtf8Error>> for Regex {
304 fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Result<String, FromUtf8Error> {
309 let bytes = <Self as Distribution<Vec<u8>>>::sample(self, rng);
310 if self.is_utf8() {
311 unsafe { Ok(String::from_utf8_unchecked(bytes)) }
312 } else {
313 String::from_utf8(bytes)
314 }
315 }
316}
317
318impl Distribution<EncodedString> for Regex {
319 fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> EncodedString {
320 let result = <Self as Distribution<Result<_, _>>>::sample(self, rng);
321 EncodedString(match result {
322 Err(e) => Es::Binary(e),
323 Ok(s) => {
324 if self.is_ascii() || s.is_ascii() {
325 Es::Ascii(s)
326 } else {
327 Es::Utf8(s)
328 }
329 }
330 })
331 }
332}
333
334impl Default for Regex {
335 #[inline]
346 fn default() -> Self {
347 Self {
348 compiled: Compiled::default(),
349 capacity: 0,
350 encoding: Encoding::Ascii,
351 }
352 }
353}
354
355impl Regex {
356 pub const fn encoding(&self) -> Encoding {
358 self.encoding
359 }
360
361 #[inline]
372 pub const fn is_ascii(&self) -> bool {
373 (self.encoding as u8) == (Encoding::Ascii as u8)
375 }
376
377 #[inline]
404 pub const fn is_utf8(&self) -> bool {
405 (self.encoding as u8) <= (Encoding::Utf8 as u8)
407 }
408
409 #[inline]
421 pub const fn capacity(&self) -> usize {
422 self.capacity
423 }
424
425 pub fn compile(pattern: &str, max_repeat: u32) -> Result<Self, Error> {
444 let hir = Parser::new().parse(pattern)?;
445 Self::with_hir(hir, max_repeat)
446 }
447
448 pub fn with_hir(hir: Hir, max_repeat: u32) -> Result<Self, Error> {
460 match hir.into_kind() {
461 HirKind::Empty => Ok(Self::default()),
462 HirKind::Look(_) => Err(Error::Anchor),
463 HirKind::Capture(hir::Capture { sub, .. }) => Self::with_hir(*sub, max_repeat),
464
465 HirKind::Literal(hir::Literal(bytes)) => Ok(Self::with_bytes_literal(bytes.into())),
466 HirKind::Class(hir::Class::Unicode(class)) => Self::with_unicode_class(&class),
467 HirKind::Class(hir::Class::Bytes(class)) => Self::with_byte_class(&class),
468 HirKind::Repetition(rep) => Self::with_repetition(rep, max_repeat),
469 HirKind::Concat(hirs) => Self::with_sequence(hirs, max_repeat),
470 HirKind::Alternation(hirs) => Self::with_choices(hirs, max_repeat),
471 }
472 }
473
474 fn with_bytes_literal(bytes: Vec<u8>) -> Self {
475 let es = EncodedString::from(bytes);
476 let encoding = es.encoding();
477 let bytes = Vec::from(es);
478 Self {
479 capacity: bytes.len(),
480 compiled: Kind::Literal(bytes).into(),
481 encoding,
482 }
483 }
484
485 fn with_unicode_class(class: &ClassUnicode) -> Result<Self, Error> {
486 Ok(if let Some(byte_class) = class.to_byte_class() {
487 Self::with_byte_class(&byte_class)?
488 } else {
489 Self {
490 compiled: compile_unicode_class(class.ranges())?.into(),
491 capacity: class.maximum_len().unwrap_or(0),
492 encoding: Encoding::Utf8,
493 }
494 })
495 }
496
497 fn with_byte_class(class: &ClassBytes) -> Result<Self, Error> {
498 Ok(Self {
499 compiled: Kind::ByteClass(ByteClass::compile(class.ranges())?).into(),
500 capacity: 1,
501 encoding: if class.is_ascii() {
502 Encoding::Ascii
503 } else {
504 Encoding::Binary
505 },
506 })
507 }
508
509 fn with_repetition(rep: Repetition, max_repeat: u32) -> Result<Self, Error> {
510 let lower = rep.min;
511 let upper = rep.max.unwrap_or(lower + max_repeat);
512
513 if upper == 0 {
515 return Ok(Self::default());
516 }
517
518 let mut regex = Self::with_hir(*rep.sub, max_repeat)?;
519 regex.capacity *= upper as usize;
520 if lower == upper {
521 regex.compiled.repeat_const *= upper;
522 } else {
523 regex
524 .compiled
525 .repeat_ranges
526 .push(Uniform::new_inclusive(lower, upper).map_err(|_| Error::Unsatisfiable)?);
527 }
528
529 if let Kind::Literal(lit) = &mut regex.compiled.kind {
531 if regex.compiled.repeat_const > 1 {
532 *lit = lit.repeat(regex.compiled.repeat_const as usize);
533 regex.compiled.repeat_const = 1;
534 }
535 }
536
537 Ok(regex)
538 }
539
540 fn with_sequence(hirs: Vec<Hir>, max_repeat: u32) -> Result<Self, Error> {
541 let mut seq = Vec::with_capacity(hirs.len());
542 let mut capacity = 0;
543 let mut encoding = Encoding::Ascii;
544
545 for hir in hirs {
546 let regex = Self::with_hir(hir, max_repeat)?;
547 capacity += regex.capacity;
548 encoding = encoding.max(regex.encoding);
549 let compiled = regex.compiled;
550 if compiled.is_single() {
551 if let Kind::Sequence(mut s) = compiled.kind {
553 seq.append(&mut s);
554 continue;
555 }
556 }
557 seq.push(compiled);
558 }
559
560 let mut simplified = Vec::with_capacity(seq.len());
562 let mut combined_lit = Vec::new();
563 for cur in seq {
564 if cur.is_single() {
565 if let Kind::Literal(mut lit) = cur.kind {
566 combined_lit.append(&mut lit);
567 continue;
568 }
569 }
570 if !combined_lit.is_empty() {
571 simplified.push(Kind::Literal(mem::take(&mut combined_lit)).into());
572 }
573 simplified.push(cur);
574 }
575
576 if !combined_lit.is_empty() {
577 simplified.push(Kind::Literal(combined_lit).into());
578 }
579
580 let compiled = match simplified.len() {
581 0 => return Ok(Self::default()),
582 1 => simplified.swap_remove(0),
583 _ => Kind::Sequence(simplified).into(),
584 };
585
586 Ok(Self {
587 compiled,
588 capacity,
589 encoding,
590 })
591 }
592
593 fn with_choices(hirs: Vec<Hir>, max_repeat: u32) -> Result<Self, Error> {
594 let mut choices = Vec::with_capacity(hirs.len());
595 let mut capacity = 0;
596 let mut encoding = Encoding::Ascii;
597 for hir in hirs {
598 let regex = Self::with_hir(hir, max_repeat)?;
599 if regex.capacity > capacity {
600 capacity = regex.capacity;
601 }
602 encoding = encoding.max(regex.encoding);
603
604 let compiled = regex.compiled;
605 if compiled.is_single() {
606 if let Kind::Any {
607 choices: mut sc, ..
608 } = compiled.kind
609 {
610 choices.append(&mut sc);
611 continue;
612 }
613 }
614 choices.push(compiled);
615 }
616 Ok(Self {
617 compiled: Kind::Any {
618 index: Uniform::new(0, choices.len()).map_err(|_| Error::Unsatisfiable)?,
619 choices,
620 }
621 .into(),
622 capacity,
623 encoding,
624 })
625 }
626}
627
/// A compiled regex node together with the number of times it is repeated.
///
/// The repetition count of one evaluation is `repeat_const` multiplied by one
/// value sampled from each entry of `repeat_ranges`.
#[derive(Clone, Debug)]
struct Compiled {
    /// Constant factor of the repetition count.
    repeat_const: u32,
    /// Variable factors of the repetition count.
    repeat_ranges: Vec<Uniform<u32>>,
    /// What is generated on each repetition.
    kind: Kind,
}
638
639impl Default for Compiled {
640 fn default() -> Self {
641 Kind::default().into()
642 }
643}
644
645impl Compiled {
646 fn is_single(&self) -> bool {
648 self.repeat_const == 1 && self.repeat_ranges.is_empty()
649 }
650}
651
/// The kinds of nodes a regex is compiled into.
#[derive(Clone, Debug)]
enum Kind {
    /// Emits a fixed byte string.
    Literal(Vec<u8>),
    /// Emits every child in order.
    Sequence(Vec<Compiled>),
    /// Emits one child picked at random.
    Any {
        /// Distribution over the indices of `choices`.
        index: Uniform<usize>,
        /// The alternatives to pick from.
        choices: Vec<Compiled>,
    },
    /// Emits a character of a Unicode class with more than
    /// `SHORT_UNICODE_CLASS_COUNT` code points.
    LongUnicodeClass(LongUnicodeClass),
    /// Emits a character of a Unicode class small enough to be stored as a table.
    ShortUnicodeClass(ShortUnicodeClass),
    /// Emits one byte of a byte class.
    ByteClass(ByteClass),
}
664
665impl Default for Kind {
666 fn default() -> Self {
667 Self::Literal(Vec::new())
668 }
669}
670
671impl From<Kind> for Compiled {
672 fn from(kind: Kind) -> Self {
673 Self {
674 repeat_const: 1,
675 repeat_ranges: Vec::new(),
676 kind,
677 }
678 }
679}
680
/// State carried through the evaluation of one sample.
struct EvalCtx<'a, R: ?Sized + 'a> {
    /// Bytes generated so far.
    output: Vec<u8>,
    /// Random number generator borrowed from the caller.
    rng: &'a mut R,
}
685
686impl<'a, R: Rng + ?Sized + 'a> EvalCtx<'a, R> {
687 fn eval(&mut self, compiled: &Compiled) {
688 let count = compiled
689 .repeat_ranges
690 .iter()
691 .fold(compiled.repeat_const, |c, u| c * u.sample(self.rng));
692
693 match &compiled.kind {
694 Kind::Literal(lit) => self.eval_literal(count, lit),
695 Kind::Sequence(seq) => self.eval_sequence(count, seq),
696 Kind::Any { index, choices } => self.eval_alt(count, index, choices),
697 Kind::LongUnicodeClass(class) => self.eval_unicode_class(count, class),
698 Kind::ShortUnicodeClass(class) => self.eval_unicode_class(count, class),
699 Kind::ByteClass(class) => self.eval_byte_class(count, class),
700 }
701 }
702
703 fn eval_literal(&mut self, count: u32, lit: &[u8]) {
704 for _ in 0..count {
705 self.output.extend_from_slice(lit);
706 }
707 }
708
709 fn eval_sequence(&mut self, count: u32, seq: &[Compiled]) {
710 for _ in 0..count {
711 for compiled in seq {
712 self.eval(compiled);
713 }
714 }
715 }
716
717 fn eval_alt(&mut self, count: u32, index: &Uniform<usize>, choices: &[Compiled]) {
718 for _ in 0..count {
719 let idx = index.sample(self.rng);
720 self.eval(&choices[idx]);
721 }
722 }
723
724 fn eval_unicode_class(&mut self, count: u32, class: &impl Distribution<char>) {
725 let mut buf = [0; 4];
726 for c in class.sample_iter(&mut self.rng).take(count as usize) {
727 let bytes = c.encode_utf8(&mut buf).as_bytes();
728 self.output.extend_from_slice(bytes);
729 }
730 }
731
732 fn eval_byte_class(&mut self, count: u32, class: &ByteClass) {
733 self.output
734 .extend(self.rng.sample_iter(class).take(count as usize));
735 }
736}
737
/// A Unicode class stored as ranges; a sampled position is mapped to its code
/// point with a binary search.
#[derive(Clone, Debug)]
struct LongUnicodeClass {
    /// Samples a position in `0..N`, where `N` is the number of code points
    /// in the class.
    searcher: Uniform<u32>,
    /// One `(first_position, offset)` entry per range, in increasing order of
    /// `first_position`. Adding `offset` to a position that falls in the range
    /// gives the code point.
    ranges: Box<[(u32, u32)]>,
}
744
745impl Distribution<char> for LongUnicodeClass {
746 fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
747 let normalized_index = self.searcher.sample(rng);
748 let entry_index = self
749 .ranges
750 .binary_search_by(|(normalized_start, _)| normalized_start.cmp(&normalized_index))
751 .unwrap_or_else(|e| e - 1);
752 let code = normalized_index + self.ranges[entry_index].1;
753 char::from_u32(code).expect("valid char")
754 }
755}
756
/// A Unicode class stored as the full list of its characters.
#[derive(Clone, Debug)]
struct ShortUnicodeClass {
    /// Distribution over the indices of `cases`.
    index: Uniform<usize>,
    /// Every character of the class.
    cases: Box<[char]>,
}
763
764impl Distribution<char> for ShortUnicodeClass {
765 fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
766 self.cases[self.index.sample(rng)]
767 }
768}
769
770fn compile_unicode_class_with(ranges: &[hir::ClassUnicodeRange], mut push: impl FnMut(char, char)) {
771 for range in ranges {
772 let start = range.start();
773 let end = range.end();
774 if start <= '\u{d7ff}' && '\u{e000}' <= end {
775 push(start, '\u{d7ff}');
776 push('\u{e000}', end);
777 } else {
778 push(start, end);
779 }
780 }
781}
782
783fn compile_unicode_class(ranges: &[hir::ClassUnicodeRange]) -> Result<Kind, Error> {
784 let mut normalized_ranges = Vec::new();
785 let mut normalized_len = 0;
786 compile_unicode_class_with(ranges, |start, end| {
787 let start = u32::from(start);
788 let end = u32::from(end);
789 normalized_ranges.push((normalized_len, start - normalized_len));
790 normalized_len += end - start + 1;
791 });
792
793 if normalized_len as usize > SHORT_UNICODE_CLASS_COUNT {
794 return Ok(Kind::LongUnicodeClass(LongUnicodeClass {
795 searcher: Uniform::new(0, normalized_len).map_err(|_| Error::Unsatisfiable)?,
796 ranges: normalized_ranges.into_boxed_slice(),
797 }));
798 }
799
800 let mut cases = Vec::with_capacity(normalized_len as usize);
802 compile_unicode_class_with(ranges, |start, end| {
803 for c in u32::from(start)..=u32::from(end) {
804 cases.push(char::from_u32(c).expect("valid char"));
805 }
806 });
807
808 Ok(Kind::ShortUnicodeClass(ShortUnicodeClass {
809 index: Uniform::new(0, cases.len()).map_err(|_| Error::Unsatisfiable)?,
810 cases: cases.into_boxed_slice(),
811 }))
812}
813
/// A byte class stored as the full list of its bytes.
#[derive(Clone, Debug)]
struct ByteClass {
    /// Distribution over the indices of `cases`.
    index: Uniform<usize>,
    /// Every byte of the class.
    cases: Box<[u8]>,
}
820
821impl ByteClass {
822 fn compile(ranges: &[hir::ClassBytesRange]) -> Result<Self, Error> {
823 let mut cases = Vec::with_capacity(256);
824 for range in ranges {
825 cases.extend(range.start()..=range.end());
826 }
827 Ok(Self {
828 index: Uniform::new(0, cases.len()).map_err(|_| Error::Unsatisfiable)?,
829 cases: cases.into_boxed_slice(),
830 })
831 }
832}
833
834impl Distribution<u8> for ByteClass {
835 fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> u8 {
836 self.cases[self.index.sample(rng)]
837 }
838}
839
840#[cfg(test)]
841mod test {
842 use super::*;
843 use rand::rng as thread_rng;
844 use std::collections::HashSet;
845 use std::ops::RangeInclusive;
846
847 fn check_str(
848 pattern: &str,
849 encoding: Encoding,
850 distinct_count: RangeInclusive<usize>,
851 run_count: usize,
852 ) {
853 let r = regex::Regex::new(pattern).unwrap();
854 let gen = Regex::compile(pattern, 100).unwrap();
855 assert!(gen.is_utf8());
856 assert_eq!(gen.encoding(), encoding);
857
858 let mut rng = thread_rng();
859
860 let mut gen_set = HashSet::<String>::with_capacity(run_count.min(*distinct_count.end()));
861 for res in (&gen).sample_iter(&mut rng).take(run_count) {
862 let res: String = res;
863 assert!(res.len() <= gen.capacity());
864 assert!(
865 r.is_match(&res),
866 "Wrong sample for pattern `{}`: `{}`",
867 pattern,
868 res
869 );
870 gen_set.insert(res);
871 }
872 let gen_count = gen_set.len();
873 assert!(
874 *distinct_count.start() <= gen_count && gen_count <= *distinct_count.end(),
875 "Distinct samples generated for pattern `{}` outside the range {:?}: {} (examples:\n{})",
876 pattern,
877 distinct_count,
878 gen_count,
879 gen_set.iter().take(10).map(|s| format!(" - {:#?}\n", s)).collect::<String>(),
880 );
881 }
882
883 fn run_count_for_distinct_count(distinct_count: usize) -> usize {
884 if distinct_count <= 1 {
899 return 8;
900 }
901 let n = distinct_count as f64;
902 ((n.ln() + 6.0 * std::f64::consts::LN_10) / (n.ln() - (n - 1.0).ln())).ceil() as usize
903 }
904
905 #[test]
906 fn sanity_test_run_count() {
907 assert_eq!(run_count_for_distinct_count(1), 8);
908 assert_eq!(run_count_for_distinct_count(2), 21);
909 assert_eq!(run_count_for_distinct_count(3), 37);
910 assert_eq!(run_count_for_distinct_count(10), 153);
911 assert_eq!(run_count_for_distinct_count(26), 436);
912 assert_eq!(run_count_for_distinct_count(62), 1104);
913 assert_eq!(run_count_for_distinct_count(128), 2381);
914 assert_eq!(run_count_for_distinct_count(214), 4096);
915 }
916
917 fn check_str_limited(pattern: &str, encoding: Encoding, distinct_count: usize) {
918 let run_count = run_count_for_distinct_count(distinct_count);
919 check_str(
920 pattern,
921 encoding,
922 distinct_count..=distinct_count,
923 run_count,
924 );
925 }
926
927 fn check_str_unlimited(pattern: &str, encoding: Encoding, min_distinct_count: usize) {
928 check_str(pattern, encoding, min_distinct_count..=4096, 4096);
929 }
930
931 #[test]
932 fn test_proptest() {
933 check_str_limited("foo", Encoding::Ascii, 1);
934 check_str_limited("foo|bar|baz", Encoding::Ascii, 3);
935 check_str_limited("a{0,8}", Encoding::Ascii, 9);
936 check_str_limited("a?", Encoding::Ascii, 2);
937 check_str_limited("a*", Encoding::Ascii, 101);
938 check_str_limited("a+", Encoding::Ascii, 101);
939 check_str_limited("a{4,}", Encoding::Ascii, 101);
940 check_str_limited("(foo|bar)(xyzzy|plugh)", Encoding::Ascii, 4);
941 check_str_unlimited(".", Encoding::Utf8, 4075);
942 check_str_unlimited("(?s).", Encoding::Utf8, 4075);
943 }
944
945 #[test]
946 fn test_regex_generate() {
947 check_str_limited("", Encoding::Ascii, 1);
948 check_str_limited("aBcDe", Encoding::Ascii, 1);
949 check_str_limited("[a-zA-Z0-9]", Encoding::Ascii, 62);
950 check_str_limited("a{3,8}", Encoding::Ascii, 6);
951 check_str_limited("a{3}", Encoding::Ascii, 1);
952 check_str_limited("a{3}-a{3}", Encoding::Ascii, 1);
953 check_str_limited("(abcde)", Encoding::Ascii, 1);
954 check_str_limited("a?b?", Encoding::Ascii, 4);
955 }
956
957 #[test]
958 #[cfg(feature = "unicode")]
959 fn test_unicode_cases() {
960 check_str_limited("(?i:fOo)", Encoding::Ascii, 8);
961 check_str_limited("(?i:a|B)", Encoding::Ascii, 4);
962 check_str_unlimited(r"(\p{Greek}\P{Greek})(?:\d{3,6})", Encoding::Utf8, 4096);
963 }
964
965 #[test]
966 fn test_ascii_character_classes() {
967 check_str_limited("[[:alnum:]]", Encoding::Ascii, 62);
968 check_str_limited("[[:alpha:]]", Encoding::Ascii, 52);
969 check_str_limited("[[:ascii:]]", Encoding::Ascii, 128);
970 check_str_limited("[[:blank:]]", Encoding::Ascii, 2);
971 check_str_limited("[[:cntrl:]]", Encoding::Ascii, 33);
972 check_str_limited("[[:digit:]]", Encoding::Ascii, 10);
973 check_str_limited("[[:graph:]]", Encoding::Ascii, 94);
974 check_str_limited("[[:lower:]]", Encoding::Ascii, 26);
975 check_str_limited("[[:print:]]", Encoding::Ascii, 95);
976 check_str_limited("[[:punct:]]", Encoding::Ascii, 32);
977 check_str_limited("[[:space:]]", Encoding::Ascii, 6);
978 check_str_limited("[[:upper:]]", Encoding::Ascii, 26);
979 check_str_limited("[[:word:]]", Encoding::Ascii, 63);
980 check_str_limited("[[:xdigit:]]", Encoding::Ascii, 22);
981 }
982
983 #[test]
984 #[cfg(feature = "unicode")]
985 fn sanity_test_unicode_character_classes_size() {
986 fn count_class_chars(pattern: &str) -> usize {
995 use regex_syntax::{
996 hir::{Class, HirKind},
997 parse,
998 };
999
1000 let hir = parse(pattern).unwrap();
1001 let HirKind::Class(Class::Unicode(cls)) = hir.into_kind() else {
1002 unreachable!()
1003 };
1004 cls.iter().map(|r| r.len()).sum()
1007 }
1008
1009 assert_eq!(count_class_chars(r"\p{L}"), 141_028);
1010 assert_eq!(count_class_chars(r"\p{M}"), 2_501);
1011 assert_eq!(count_class_chars(r"\p{N}"), 1_911);
1012 assert_eq!(count_class_chars(r"\p{P}"), 855);
1013 assert_eq!(count_class_chars(r"\p{S}"), 8_514);
1014 assert_eq!(count_class_chars(r"\p{Z}"), 19);
1015 assert_eq!(count_class_chars(r"\p{C}"), 959_284);
1016
1017 assert_eq!(count_class_chars(r"\p{Latin}"), 1_487);
1018 assert_eq!(count_class_chars(r"\p{Greek}"), 518);
1019 assert_eq!(count_class_chars(r"\p{Cyrillic}"), 508);
1020 assert_eq!(count_class_chars(r"\p{Armenian}"), 96);
1021 assert_eq!(count_class_chars(r"\p{Hebrew}"), 134);
1022 assert_eq!(count_class_chars(r"\p{Arabic}"), 1_373);
1023 assert_eq!(count_class_chars(r"\p{Syriac}"), 88);
1024 assert_eq!(count_class_chars(r"\p{Thaana}"), 50);
1025 assert_eq!(count_class_chars(r"\p{Devanagari}"), 164);
1026 assert_eq!(count_class_chars(r"\p{Bengali}"), 96);
1027 assert_eq!(count_class_chars(r"\p{Gurmukhi}"), 80);
1028 assert_eq!(count_class_chars(r"\p{Gujarati}"), 91);
1029 assert_eq!(count_class_chars(r"\p{Oriya}"), 91);
1030 assert_eq!(count_class_chars(r"\p{Tamil}"), 123);
1031 assert_eq!(count_class_chars(r"\p{Hangul}"), 11_739);
1032 assert_eq!(count_class_chars(r"\p{Hiragana}"), 381);
1033 assert_eq!(count_class_chars(r"\p{Katakana}"), 321);
1034 assert_eq!(count_class_chars(r"\p{Han}"), 99_030);
1035 assert_eq!(count_class_chars(r"\p{Tagalog}"), 23);
1036 assert_eq!(count_class_chars(r"\p{Linear_B}"), 211);
1037 assert_eq!(count_class_chars(r"\p{Inherited}"), 657);
1038
1039 assert_eq!(count_class_chars(r"\d"), 760);
1040 assert_eq!(count_class_chars(r"\s"), 25);
1041 assert_eq!(count_class_chars(r"\w"), 144_667);
1042 }
1043
1044 #[test]
1045 #[cfg(feature = "unicode")]
1046 fn test_unicode_character_classes() {
1047 check_str_unlimited(r"\p{L}", Encoding::Utf8, 3999);
1078 check_str(r"\p{M}", Encoding::Utf8, 1941..=2501, 4096);
1079 check_str(r"\p{N}", Encoding::Utf8, 1630..=1911, 4096);
1080 check_str(r"\p{P}", Encoding::Utf8, 835..=855, 4096);
1081 check_str_unlimited(r"\p{S}", Encoding::Utf8, 3151);
1082 check_str_limited(r"\p{Z}", Encoding::Utf8, 19);
1083 check_str_unlimited(r"\p{C}", Encoding::Utf8, 4073);
1084
1085 check_str_unlimited(r"\P{L}", Encoding::Utf8, 4073);
1086 check_str_unlimited(r"\P{M}", Encoding::Utf8, 4075);
1087 check_str_unlimited(r"\P{N}", Encoding::Utf8, 4075);
1088 check_str_unlimited(r"\P{P}", Encoding::Utf8, 4075);
1089 check_str_unlimited(r"\P{S}", Encoding::Utf8, 4075);
1090 check_str_unlimited(r"\P{Z}", Encoding::Utf8, 4075);
1091 check_str_unlimited(r"\P{C}", Encoding::Utf8, 4007);
1092 }
1093
1094 #[test]
1095 #[cfg(feature = "unicode")]
1096 fn test_unicode_script_classes() {
1097 check_str(r"\p{Latin}", Encoding::Utf8, 1352..=1487, 4096);
1098 check_str(r"\p{Greek}", Encoding::Utf8, 516..=518, 4096);
1099 check_str(r"\p{Cyrillic}", Encoding::Utf8, 506..=508, 4096);
1100 check_str_limited(r"\p{Armenian}", Encoding::Utf8, 96);
1101 check_str_limited(r"\p{Hebrew}", Encoding::Utf8, 134);
1102 check_str(r"\p{Arabic}", Encoding::Utf8, 1268..=1373, 4096);
1103 check_str_limited(r"\p{Syriac}", Encoding::Utf8, 88);
1104 check_str_limited(r"\p{Thaana}", Encoding::Utf8, 50);
1105 check_str_limited(r"\p{Devanagari}", Encoding::Utf8, 164);
1106 check_str_limited(r"\p{Bengali}", Encoding::Utf8, 96);
1107 check_str_limited(r"\p{Gurmukhi}", Encoding::Utf8, 80);
1108 check_str_limited(r"\p{Gujarati}", Encoding::Utf8, 91);
1109 check_str_limited(r"\p{Oriya}", Encoding::Utf8, 91);
1110 check_str_limited(r"\p{Tamil}", Encoding::Utf8, 123);
1111 check_str_unlimited(r"\p{Hangul}", Encoding::Utf8, 3363);
1112 check_str_limited(r"\p{Hiragana}", Encoding::Utf8, 381);
1113 check_str_limited(r"\p{Katakana}", Encoding::Utf8, 321);
1114 check_str_unlimited(r"\p{Han}", Encoding::Utf8, 3970);
1115 check_str_limited(r"\p{Tagalog}", Encoding::Utf8, 23);
1116 check_str_limited(r"\p{Linear_B}", Encoding::Utf8, 211);
1117 check_str(r"\p{Inherited}", Encoding::Utf8, 650..=657, 4096);
1118 }
1119
1120 #[test]
1121 #[cfg(feature = "unicode")]
1122 fn test_perl_classes() {
1123 check_str_unlimited(r"\d+", Encoding::Utf8, 4061);
1124 check_str_unlimited(r"\D+", Encoding::Utf8, 4096);
1125 check_str_unlimited(r"\s+", Encoding::Utf8, 4014);
1126 check_str_unlimited(r"\S+", Encoding::Utf8, 4096);
1127 check_str_unlimited(r"\w+", Encoding::Utf8, 4095);
1128 check_str_unlimited(r"\W+", Encoding::Utf8, 4096);
1129 }
1130
1131 #[cfg(any())]
1132 fn dump_categories() {
1133 use regex_syntax::hir::*;
1134
1135 let categories = &[r"\p{Nd}", r"\p{Greek}"];
1136
1137 for cat in categories {
1138 if let HirKind::Class(Class::Unicode(cls)) =
1139 regex_syntax::Parser::new().parse(cat).unwrap().into_kind()
1140 {
1141 let s: u32 = cls
1142 .iter()
1143 .map(|r| u32::from(r.end()) - u32::from(r.start()) + 1)
1144 .sum();
1145 println!("{} => {}", cat, s);
1146 }
1147 }
1148 }
1149
1150 #[test]
1151 fn test_binary_generator() {
1152 const PATTERN: &str = r"PE\x00\x00.{20}";
1153
1154 let r = regex::bytes::RegexBuilder::new(PATTERN)
1155 .unicode(false)
1156 .dot_matches_new_line(true)
1157 .build()
1158 .unwrap();
1159
1160 let hir = regex_syntax::ParserBuilder::new()
1161 .unicode(false)
1162 .dot_matches_new_line(true)
1163 .utf8(false)
1164 .build()
1165 .parse(PATTERN)
1166 .unwrap();
1167
1168 let gen = Regex::with_hir(hir, 100).unwrap();
1169 assert_eq!(gen.capacity(), 24);
1170 assert!(!gen.is_utf8());
1171 assert_eq!(gen.encoding(), Encoding::Binary);
1172
1173 let mut rng = thread_rng();
1174 for res in gen.sample_iter(&mut rng).take(8192) {
1175 let res: Vec<u8> = res;
1176 assert!(r.is_match(&res), "Wrong sample: {:?}, `{:?}`", r, res);
1177 }
1178 }
1179
1180 #[test]
1181 fn test_encoding_generator_1() {
1182 let hir = regex_syntax::ParserBuilder::new()
1183 .unicode(false)
1184 .utf8(false)
1185 .build()
1186 .parse(r"[\x00-\xff]{2}")
1187 .unwrap();
1188 let gen = Regex::with_hir(hir, 100).unwrap();
1189
1190 let mut encoding_counts = [0; 3];
1196 let mut rng = thread_rng();
1197 for encoded_string in gen.sample_iter(&mut rng).take(8192) {
1198 let encoded_string: EncodedString = encoded_string;
1199 let bytes = encoded_string.as_bytes();
1200 let encoding = encoded_string.encoding();
1201 assert_eq!(bytes.len(), 2);
1202 if bytes.is_ascii() {
1203 assert_eq!(encoding, Encoding::Ascii);
1204 } else if std::str::from_utf8(bytes).is_ok() {
1205 assert_eq!(encoding, Encoding::Utf8);
1206 } else {
1207 assert_eq!(encoding, Encoding::Binary);
1208 }
1209 encoding_counts[encoding as usize] += 1;
1210 }
1211
1212 assert!((1858..2243).contains(&encoding_counts[Encoding::Ascii as usize]));
1214 assert!((169..319).contains(&encoding_counts[Encoding::Utf8 as usize]));
1215 assert!((5704..6102).contains(&encoding_counts[Encoding::Binary as usize]));
1216 }
1217
1218 #[test]
1219 fn test_encoding_generator_2() {
1220 let gen = Regex::compile(r"[\u{0}-\u{b5}]{2}", 100).unwrap();
1221
1222 let mut encoding_counts = [0; 2];
1227 let mut rng = thread_rng();
1228 for encoded_string in gen.sample_iter(&mut rng).take(8192) {
1229 let encoded_string: EncodedString = encoded_string;
1230 let encoding = encoded_string.encoding();
1231 let string = encoded_string.as_str().unwrap();
1232 assert_eq!(string.chars().count(), 2);
1233 if string.is_ascii() {
1234 assert_eq!(encoding, Encoding::Ascii);
1235 assert_eq!(string.len(), 2);
1236 } else {
1237 assert_eq!(encoding, Encoding::Utf8);
1238 }
1239 encoding_counts[encoding as usize] += 1;
1240 }
1241
1242 assert!((3876..4319).contains(&encoding_counts[Encoding::Ascii as usize]));
1244 assert!((3874..4317).contains(&encoding_counts[Encoding::Utf8 as usize]));
1245 }
1246
1247 #[test]
1248 fn test_encoding_generator_3() {
1249 let gen = Regex::compile(r"[\u{0}-\u{7f}]{2}", 100).unwrap();
1250 let mut rng = thread_rng();
1251 for encoded_string in gen.sample_iter(&mut rng).take(8192) {
1252 let encoded_string: EncodedString = encoded_string;
1253 assert_eq!(encoded_string.encoding(), Encoding::Ascii);
1254 assert_eq!(String::try_from(encoded_string).unwrap().len(), 2);
1255 }
1256 }
1257
1258 #[test]
1259 #[should_panic(expected = "FromUtf8Error")]
1260 fn test_generating_non_utf8_string() {
1261 let hir = regex_syntax::ParserBuilder::new()
1262 .unicode(false)
1263 .utf8(false)
1264 .build()
1265 .parse(r"\x88")
1266 .unwrap();
1267
1268 let gen = Regex::with_hir(hir, 100).unwrap();
1269 assert!(!gen.is_utf8());
1270 assert_eq!(gen.encoding(), Encoding::Binary);
1271
1272 let mut rng = thread_rng();
1273 let _: String = rng.sample(&gen);
1274 }
1275
1276 #[test]
1277 fn unsatisfiable_char_class_intersection() {
1278 assert!(matches!(
1279 Regex::compile("[a&&b]", 100),
1280 Err(Error::Unsatisfiable)
1281 ));
1282 }
1283}