regex_mutator/
lib.rs

1// Nautilus
2// Copyright (C) 2020  Daniel Teuchert, Cornelius Aschermann, Sergej Schumilo
3
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU Affero General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU Affero General Public License for more details.
13
14// You should have received a copy of the GNU Affero General Public License
15// along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17extern crate regex_syntax;
18
19use regex_syntax::hir::{
20    Class, ClassBytesRange, ClassUnicodeRange, Hir, Literal, RepetitionKind, RepetitionRange,
21};
22
23pub struct RomuPrng {
24    xstate: u64,
25    ystate: u64,
26}
27
28impl RomuPrng {
29    #[must_use]
30    pub fn new(xstate: u64, ystate: u64) -> Self {
31        Self { xstate, ystate }
32    }
33
34    pub fn range(&mut self, min: usize, max: usize) -> usize {
35        ((self.next_u64() as usize) % (max - min)) + min
36    }
37
38    #[must_use]
39    pub fn new_from_u64(seed: u64) -> Self {
40        let mut res = Self::new(seed, seed ^ 0xec77_1522_8265_0854);
41        for _ in 0..4 {
42            res.next_u64();
43        }
44        res
45    }
46
47    pub fn next_u32(&mut self) -> u32 {
48        self.next_u64() as u32
49    }
50
51    pub fn next_u64(&mut self) -> u64 {
52        let xp = self.xstate;
53        self.xstate = 15_241_094_284_759_029_579_u64.wrapping_mul(self.ystate);
54        self.ystate = self.ystate.wrapping_sub(xp);
55        self.ystate = self.ystate.rotate_left(27);
56        xp
57    }
58}
59
60pub struct RegexScript {
61    rng: RomuPrng,
62    remaining: usize,
63}
64
65impl RegexScript {
66    #[must_use]
67    pub fn new(seed: u64) -> Self {
68        let mut rng = RomuPrng::new_from_u64(seed);
69
70        let len = if rng.next_u64() % 256 == 0 {
71            rng.next_u64() % 0xffff
72        } else {
73            let len = 1 << (rng.next_u64() % 8);
74            rng.next_u64() % len
75        };
76        RegexScript {
77            rng,
78            remaining: len as usize,
79        }
80    }
81
82    pub fn get_mod(&mut self, val: usize) -> usize {
83        if self.remaining == 0 {
84            return 0;
85        }
86        (self.rng.next_u32() as usize) % val
87    }
88
89    pub fn get_range(&mut self, min: usize, max: usize) -> usize {
90        self.get_mod(max - min) + min
91    }
92}
93
94fn append_char(res: &mut Vec<u8>, chr: char) {
95    let mut buf = [0; 4];
96    res.extend_from_slice(chr.encode_utf8(&mut buf).as_bytes());
97}
98
99fn append_lit(res: &mut Vec<u8>, lit: &Literal) {
100    use regex_syntax::hir::Literal::{Byte, Unicode};
101
102    match lit {
103        Unicode(chr) => append_char(res, *chr),
104        Byte(b) => res.push(*b),
105    }
106}
107
108fn append_unicode_range(res: &mut Vec<u8>, scr: &mut RegexScript, cls: ClassUnicodeRange) {
109    let mut chr_a_buf = [0; 4];
110    let mut chr_b_buf = [0; 4];
111    cls.start().encode_utf8(&mut chr_a_buf);
112    cls.end().encode_utf8(&mut chr_b_buf);
113    let a = u32::from_le_bytes(chr_a_buf);
114    let b = u32::from_le_bytes(chr_b_buf);
115    let c = scr.get_range(a as usize, (b + 1) as usize) as u32;
116    append_char(res, std::char::from_u32(c).unwrap());
117}
118
119fn append_byte_range(res: &mut Vec<u8>, scr: &mut RegexScript, cls: ClassBytesRange) {
120    res.push(scr.get_range(cls.start() as usize, (cls.end() + 1) as usize) as u8);
121}
122
123fn append_class(res: &mut Vec<u8>, scr: &mut RegexScript, cls: &Class) {
124    use regex_syntax::hir::Class::{Bytes, Unicode};
125    match cls {
126        Unicode(cls) => {
127            let rngs = cls.ranges();
128            let rng = rngs[scr.get_mod(rngs.len())];
129            append_unicode_range(res, scr, rng);
130        }
131        Bytes(cls) => {
132            let rngs = cls.ranges();
133            let rng = rngs[scr.get_mod(rngs.len())];
134            append_byte_range(res, scr, rng);
135        }
136    }
137}
138
139fn get_length(scr: &mut RegexScript) -> usize {
140    let bits = scr.get_mod(8);
141    scr.get_mod(2 << bits)
142}
143
144fn get_repetition_range(rep: &RepetitionRange, scr: &mut RegexScript) -> usize {
145    use regex_syntax::hir::RepetitionRange::{AtLeast, Bounded, Exactly};
146    match rep {
147        Exactly(a) => *a as usize,
148        AtLeast(a) => get_length(scr) + (*a as usize),
149        Bounded(a, b) => scr.get_range(*a as usize, *b as usize),
150    }
151}
152
153fn get_repetitions(rep: &RepetitionKind, scr: &mut RegexScript) -> usize {
154    use regex_syntax::hir::RepetitionKind::{OneOrMore, Range, ZeroOrMore, ZeroOrOne};
155    match rep {
156        ZeroOrOne => scr.get_mod(2),
157        ZeroOrMore => get_length(scr),
158        OneOrMore => 1 + get_length(scr),
159        Range(rng) => get_repetition_range(rng, scr),
160    }
161}
162
163#[must_use]
164pub fn generate(hir: &Hir, seed: u64) -> Vec<u8> {
165    use regex_syntax::hir::HirKind::{
166        Alternation, Anchor, Class, Concat, Empty, Group, Literal, Repetition, WordBoundary,
167    };
168    let mut scr = RegexScript::new(seed);
169    let mut stack = vec![hir];
170    let mut res = vec![];
171    while !stack.is_empty() {
172        match stack.pop().unwrap().kind() {
173            Anchor(_) | WordBoundary(_) | Empty => {}
174            Literal(lit) => append_lit(&mut res, lit),
175            Class(cls) => append_class(&mut res, &mut scr, cls),
176            Repetition(rep) => {
177                let num = get_repetitions(&rep.kind, &mut scr);
178                for _ in 0..num {
179                    stack.push(&rep.hir);
180                }
181            }
182            Group(grp) => stack.push(&grp.hir),
183            Concat(hirs) => hirs.iter().rev().for_each(|h| stack.push(h)),
184            Alternation(hirs) => stack.push(&hirs[scr.get_mod(hirs.len())]),
185        }
186    }
187    res
188}