1extern crate regex_syntax;
18
19use regex_syntax::hir::{
20 Class, ClassBytesRange, ClassUnicodeRange, Hir, Literal, RepetitionKind, RepetitionRange,
21};
22
23pub struct RomuPrng {
24 xstate: u64,
25 ystate: u64,
26}
27
28impl RomuPrng {
29 #[must_use]
30 pub fn new(xstate: u64, ystate: u64) -> Self {
31 Self { xstate, ystate }
32 }
33
34 pub fn range(&mut self, min: usize, max: usize) -> usize {
35 ((self.next_u64() as usize) % (max - min)) + min
36 }
37
38 #[must_use]
39 pub fn new_from_u64(seed: u64) -> Self {
40 let mut res = Self::new(seed, seed ^ 0xec77_1522_8265_0854);
41 for _ in 0..4 {
42 res.next_u64();
43 }
44 res
45 }
46
47 pub fn next_u32(&mut self) -> u32 {
48 self.next_u64() as u32
49 }
50
51 pub fn next_u64(&mut self) -> u64 {
52 let xp = self.xstate;
53 self.xstate = 15_241_094_284_759_029_579_u64.wrapping_mul(self.ystate);
54 self.ystate = self.ystate.wrapping_sub(xp);
55 self.ystate = self.ystate.rotate_left(27);
56 xp
57 }
58}
59
60pub struct RegexScript {
61 rng: RomuPrng,
62 remaining: usize,
63}
64
65impl RegexScript {
66 #[must_use]
67 pub fn new(seed: u64) -> Self {
68 let mut rng = RomuPrng::new_from_u64(seed);
69
70 let len = if rng.next_u64() % 256 == 0 {
71 rng.next_u64() % 0xffff
72 } else {
73 let len = 1 << (rng.next_u64() % 8);
74 rng.next_u64() % len
75 };
76 RegexScript {
77 rng,
78 remaining: len as usize,
79 }
80 }
81
82 pub fn get_mod(&mut self, val: usize) -> usize {
83 if self.remaining == 0 {
84 return 0;
85 }
86 (self.rng.next_u32() as usize) % val
87 }
88
89 pub fn get_range(&mut self, min: usize, max: usize) -> usize {
90 self.get_mod(max - min) + min
91 }
92}
93
94fn append_char(res: &mut Vec<u8>, chr: char) {
95 let mut buf = [0; 4];
96 res.extend_from_slice(chr.encode_utf8(&mut buf).as_bytes());
97}
98
99fn append_lit(res: &mut Vec<u8>, lit: &Literal) {
100 use regex_syntax::hir::Literal::{Byte, Unicode};
101
102 match lit {
103 Unicode(chr) => append_char(res, *chr),
104 Byte(b) => res.push(*b),
105 }
106}
107
108fn append_unicode_range(res: &mut Vec<u8>, scr: &mut RegexScript, cls: ClassUnicodeRange) {
109 let mut chr_a_buf = [0; 4];
110 let mut chr_b_buf = [0; 4];
111 cls.start().encode_utf8(&mut chr_a_buf);
112 cls.end().encode_utf8(&mut chr_b_buf);
113 let a = u32::from_le_bytes(chr_a_buf);
114 let b = u32::from_le_bytes(chr_b_buf);
115 let c = scr.get_range(a as usize, (b + 1) as usize) as u32;
116 append_char(res, std::char::from_u32(c).unwrap());
117}
118
119fn append_byte_range(res: &mut Vec<u8>, scr: &mut RegexScript, cls: ClassBytesRange) {
120 res.push(scr.get_range(cls.start() as usize, (cls.end() + 1) as usize) as u8);
121}
122
123fn append_class(res: &mut Vec<u8>, scr: &mut RegexScript, cls: &Class) {
124 use regex_syntax::hir::Class::{Bytes, Unicode};
125 match cls {
126 Unicode(cls) => {
127 let rngs = cls.ranges();
128 let rng = rngs[scr.get_mod(rngs.len())];
129 append_unicode_range(res, scr, rng);
130 }
131 Bytes(cls) => {
132 let rngs = cls.ranges();
133 let rng = rngs[scr.get_mod(rngs.len())];
134 append_byte_range(res, scr, rng);
135 }
136 }
137}
138
139fn get_length(scr: &mut RegexScript) -> usize {
140 let bits = scr.get_mod(8);
141 scr.get_mod(2 << bits)
142}
143
144fn get_repetition_range(rep: &RepetitionRange, scr: &mut RegexScript) -> usize {
145 use regex_syntax::hir::RepetitionRange::{AtLeast, Bounded, Exactly};
146 match rep {
147 Exactly(a) => *a as usize,
148 AtLeast(a) => get_length(scr) + (*a as usize),
149 Bounded(a, b) => scr.get_range(*a as usize, *b as usize),
150 }
151}
152
153fn get_repetitions(rep: &RepetitionKind, scr: &mut RegexScript) -> usize {
154 use regex_syntax::hir::RepetitionKind::{OneOrMore, Range, ZeroOrMore, ZeroOrOne};
155 match rep {
156 ZeroOrOne => scr.get_mod(2),
157 ZeroOrMore => get_length(scr),
158 OneOrMore => 1 + get_length(scr),
159 Range(rng) => get_repetition_range(rng, scr),
160 }
161}
162
163#[must_use]
164pub fn generate(hir: &Hir, seed: u64) -> Vec<u8> {
165 use regex_syntax::hir::HirKind::{
166 Alternation, Anchor, Class, Concat, Empty, Group, Literal, Repetition, WordBoundary,
167 };
168 let mut scr = RegexScript::new(seed);
169 let mut stack = vec![hir];
170 let mut res = vec![];
171 while !stack.is_empty() {
172 match stack.pop().unwrap().kind() {
173 Anchor(_) | WordBoundary(_) | Empty => {}
174 Literal(lit) => append_lit(&mut res, lit),
175 Class(cls) => append_class(&mut res, &mut scr, cls),
176 Repetition(rep) => {
177 let num = get_repetitions(&rep.kind, &mut scr);
178 for _ in 0..num {
179 stack.push(&rep.hir);
180 }
181 }
182 Group(grp) => stack.push(&grp.hir),
183 Concat(hirs) => hirs.iter().rev().for_each(|h| stack.push(h)),
184 Alternation(hirs) => stack.push(&hirs[scr.get_mod(hirs.len())]),
185 }
186 }
187 res
188}