regress 0.2.0

A regular expression engine targeting EcmaScript syntax
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
//! Optimizations on regex IR

use crate::folds;
use crate::insn::{MAX_BYTE_SET_LENGTH, MAX_CHAR_SET_LENGTH};
use crate::ir::*;
use crate::types::BracketContents;

/// When unrolling a loop, the largest minimum count we will unroll.
const LOOP_UNROLL_THRESHOLD: usize = 5;

/// Things that a Pass may do.
pub enum PassAction {
    // Do nothing to the given node.
    Keep,

    // Notes that we modified the node in-place.
    Modified,

    // Remove the given node outright, effectively replacing it with empty.
    Remove,

    /// Replace the given node with a new Node.
    Replace(Node),
}

#[derive(Debug)]
struct Pass<'a, F>
where
    F: FnMut(&mut Node, &Walk) -> PassAction,
{
    // The function.
    func: &'a mut F,

    // Whether this pass has changed anything.
    changed: bool,
}

impl<'a, F> Pass<'a, F>
where
    F: FnMut(&mut Node, &Walk) -> PassAction,
{
    fn new(func: &'a mut F) -> Self {
        Pass {
            func,
            changed: false,
        }
    }

    fn run_postorder(&mut self, start: &mut Node) {
        walk_mut(
            true,
            start,
            &mut |n: &mut Node, walk: &mut Walk| match (self.func)(n, walk) {
                PassAction::Keep => {}
                PassAction::Modified => {
                    self.changed = true;
                }
                PassAction::Remove => {
                    *n = Node::Empty;
                    self.changed = true;
                }
                PassAction::Replace(newnode) => {
                    *n = newnode;
                    self.changed = true;
                }
            },
        )
    }

    fn run_to_fixpoint(&mut self, n: &mut Node) {
        debug_assert!(!self.changed, "Pass has already been run");
        loop {
            self.changed = false;
            self.run_postorder(n);
            if !self.changed {
                break;
            }
        }
    }
}

/// Run a "pass" on a regex, which is a function that takes a Node and maybe
/// returns a new node. \return true if something changed, false if nothing did.
fn run_pass<F>(r: &mut Regex, func: &mut F) -> bool
where
    F: FnMut(&mut Node, &Walk) -> PassAction,
{
    let mut p = Pass::new(func);
    p.run_to_fixpoint(&mut r.node);
    p.changed
}

// Here are some optimizations we support.

// Remove empty Nodes.
fn remove_empties(n: &mut Node, _w: &Walk) -> PassAction {
    match n {
        Node::Empty => PassAction::Keep,
        Node::Goal => PassAction::Keep,
        Node::Char { .. } => PassAction::Keep,
        Node::ByteSequence(v) => {
            if v.is_empty() {
                PassAction::Remove
            } else {
                PassAction::Keep
            }
        }

        // Note: do not remove empty sets. These always match against one character; an empty set
        // should just fail.
        Node::ByteSet(..) => PassAction::Keep,
        Node::CharSet(..) => PassAction::Keep,
        Node::Cat(nodes) => {
            let blen = nodes.len();
            nodes.retain(|nn| !nn.is_empty());
            if nodes.len() == blen {
                // Nothing was removed.
                PassAction::Keep
            } else {
                match nodes.len() {
                    0 => PassAction::Remove,
                    1 => PassAction::Replace(nodes.pop().unwrap()),
                    _ => PassAction::Modified,
                }
            }
        }
        Node::Alt(left, right) => {
            // Empty alt may match the empty string.
            // Remove it only if both sides are empty.
            if left.is_empty() && right.is_empty() {
                PassAction::Remove
            } else {
                PassAction::Keep
            }
        }
        Node::MatchAny => PassAction::Keep,
        Node::MatchAnyExceptLineTerminator => PassAction::Keep,
        Node::Anchor { .. } => PassAction::Keep,
        Node::Loop {
            quant,
            loopee,
            enclosed_groups,
        } => {
            // A loop is empty if it has an empty body, or 0 max iters.
            // But do not remove contained capture groups.
            if loopee.is_empty() || (quant.max == 0 && enclosed_groups.start == enclosed_groups.end)
            {
                PassAction::Remove
            } else {
                PassAction::Keep
            }
        }
        Node::Loop1CharBody { .. } => PassAction::Keep,
        Node::CaptureGroup(..) => {
            // Capture groups could in principle be optimized if they only match empties.
            PassAction::Keep
        }
        Node::WordBoundary { .. } => PassAction::Keep,
        Node::BackRef { .. } => PassAction::Keep,
        Node::Bracket { .. } => PassAction::Keep,
        Node::LookaroundAssertion {
            negate, contents, ..
        } => {
            // Negative arounds that match empties could in principle be optimized to always
            // fail. Here we only optimize positive ones.
            if !*negate && contents.is_empty() {
                PassAction::Remove
            } else {
                PassAction::Keep
            }
        }
    }
}

// Remove excess cats.
fn decat(n: &mut Node, _w: &Walk) -> PassAction {
    match n {
        Node::Cat(nodes) => {
            if nodes.is_empty() {
                PassAction::Remove
            } else if nodes.len() == 1 {
                PassAction::Replace(nodes.pop().unwrap())
            } else if nodes.iter().any(|nn| nn.is_cat()) {
                // Flatmap child cats.
                // Unfortunately we can't use flatmap() because there's no single iterator type
                // we can return.
                // Avoid copying nodes by switching them into owned vec.
                let mut catted = Vec::new();
                std::mem::swap(nodes, &mut catted);

                // Decat them.
                let mut decatted = Vec::new();
                for nn in catted {
                    match nn {
                        Node::Cat(mut nnodes) => {
                            decatted.append(&mut nnodes);
                        }
                        _ => decatted.push(nn),
                    }
                }
                PassAction::Replace(Node::Cat(decatted))
            } else {
                PassAction::Keep
            }
        }
        _ => PassAction::Keep,
    }
}

/// Unfold icase chars.
/// That means for case-insensitive characters, figure out everything that they
/// could match.
/// TODO: should cache unfolding.
fn unfold_icase_chars(n: &mut Node, _w: &Walk) -> PassAction {
    match *n {
        Node::Char { c, icase } if icase => {
            let unfolded = folds::unfold_char(c);
            debug_assert!(
                unfolded.contains(&c),
                "Char should always unfold to at least itself"
            );
            match unfolded.len() {
                0 => panic!("Char should always unfold to at least itself"),
                1 => {
                    // Character does not fold or unfold at all.
                    PassAction::Replace(Node::Char { c, icase: false })
                }
                2..=MAX_BYTE_SET_LENGTH => {
                    // We unfolded to 2+ characters.
                    PassAction::Replace(Node::CharSet(unfolded))
                }
                _ => panic!("Unfolded to more characters than we believed possible"),
            }
        }
        _ => PassAction::Keep,
    }
}

// Perform simple unrolling of loops that have a minimum.
fn unroll_loops(n: &mut Node, _w: &Walk) -> PassAction {
    match n {
        Node::Loop {
            loopee,
            quant,
            enclosed_groups,
        } => {
            // TODO: consider ignoring loops with nested sub-loops?

            // Do not unroll loops with enclosed groups.
            if enclosed_groups.start < enclosed_groups.end {
                return PassAction::Keep;
            }
            // Do not unroll large loops, or loops which may execute zero times.
            if quant.min == 0 || quant.min > LOOP_UNROLL_THRESHOLD {
                return PassAction::Keep;
            }

            // We made it through. Replace us with a cat.
            let mut unrolled = Vec::new();
            for _ in 0..quant.min {
                unrolled.push(loopee.as_mut().duplicate());
            }

            // We unrolled 'min' elements.
            // Maybe our loop is now empty.
            quant.max -= quant.min;
            quant.min = 0;
            if quant.max > 0 {
                // Move the loop to the end of unrolled.
                let mut loop_node = Node::Empty;
                std::mem::swap(&mut loop_node, n);
                unrolled.push(loop_node);
            }
            *n = Node::Cat(unrolled);
            PassAction::Modified
        }
        _ => PassAction::Keep,
    }
}

/// Replace Loops with 1Char loops whenever possible.
fn promote_1char_loops(n: &mut Node, _w: &Walk) -> PassAction {
    match n {
        Node::Loop {
            loopee,
            quant,
            enclosed_groups,
        } => {
            // Must be 1Char.
            if !loopee.matches_exactly_one_char() {
                return PassAction::Keep;
            }

            // The above check should be sufficient to ensure we have no enclosed groups.
            assert!(
                enclosed_groups.start >= enclosed_groups.end,
                "Should have no enclosed groups"
            );

            // This feels hackish?
            let mut new_loopee = Box::new(Node::Empty);
            std::mem::swap(&mut new_loopee, loopee);

            *n = Node::Loop1CharBody {
                loopee: new_loopee,
                quant: *quant,
            };
            PassAction::Modified
        }
        _ => PassAction::Keep,
    }
}

/// Replace Cat(Char) with ByteSeq.
/// Also replace chars with literal bytes.
/// TODO: this seems to do too much; consider breaking this up.
fn form_literal_bytes(n: &mut Node, walk: &Walk) -> PassAction {
    // Helper to return a mutable reference to the nodes of a literal bytes.
    fn get_literal_bytes(n: &mut Node) -> Option<&mut Vec<u8>> {
        match n {
            Node::ByteSequence(v) => Some(v),
            _ => None,
        }
    }
    match n {
        Node::Char { c, icase } if !*icase => {
            let mut buff = [0; 4];
            PassAction::Replace(Node::ByteSequence(
                c.encode_utf8(&mut buff).as_bytes().to_vec(),
            ))
        }
        Node::CharSet(chars) if chars.iter().all(char::is_ascii) => {
            // All of our chars are ASCII; use a byte set instead.
            PassAction::Replace(Node::ByteSet(
                chars.iter().map(|&c| c as u32 as u8).collect(),
            ))
        }
        Node::Cat(nodes) => {
            // Find and merge adjacent ByteSeq.
            let mut modified = false;
            for idx in 1..nodes.len() {
                let (prev_slice, curr_slice) = nodes.split_at_mut(idx);
                match (
                    get_literal_bytes(prev_slice.last_mut().unwrap()),
                    get_literal_bytes(curr_slice.first_mut().unwrap()),
                ) {
                    (Some(prev_bytes), Some(curr_bytes))
                        if !prev_bytes.is_empty() && !curr_bytes.is_empty() =>
                    {
                        if walk.in_lookbehind {
                            // Our characters were already reversed; we need to reverse them again
                            // as literal bytes are always forwards. For
                            // example, if we have (<=ab), then we will get Cat(b, a) but we want
                            // literal bytes "ab".
                            curr_bytes.append(prev_bytes);
                        } else {
                            prev_bytes.append(curr_bytes);
                            std::mem::swap(prev_bytes, curr_bytes);
                        }
                        modified = true;
                    }
                    _ => (),
                }
            }
            if modified {
                PassAction::Modified
            } else {
                PassAction::Keep
            }
        }
        _ => PassAction::Keep,
    }
}

/// Try to reduce a bracket to something simpler.
fn try_reduce_bracket(bc: &BracketContents) -> Option<Node> {
    if bc.invert {
        // Give up.
        return None;
    }

    // Count the number of code points.
    let mut cps_count = 0;
    for iv in bc.cps.intervals() {
        cps_count += iv.count_codepoints();
    }
    if cps_count > MAX_CHAR_SET_LENGTH {
        // Too many code points.
        return None;
    }

    // Ok, we want to make a char set.
    // Note we cannot make a char out of surrogates; should char conversion fail we
    // just give up.
    let mut res = Vec::new();
    for iv in bc.cps.intervals() {
        for cp in iv.codepoints() {
            res.push(std::char::from_u32(cp)?);
        }
    }
    debug_assert!(res.len() <= MAX_CHAR_SET_LENGTH, "Unexpectedly many chars");
    Some(Node::CharSet(res))
}

/// Optimize brackets like [a-b].
/// Optimize certain stupid brackets like [a] to a single char.
/// Invert a bracket if it would *reduce* the number of ranges.
/// Note we only run this once.
fn simplify_brackets(n: &mut Node, _walk: &Walk) -> PassAction {
    match n {
        Node::Bracket(bc) => {
            if let Some(new_node) = try_reduce_bracket(bc) {
                return PassAction::Replace(new_node);
            }

            // TODO: does this ever help anything?
            if bc.cps.intervals().len() > bc.cps.inverted_interval_count() {
                bc.cps = bc.cps.inverted();
                bc.invert = !bc.invert;
                PassAction::Modified
            } else {
                PassAction::Keep
            }
        }
        _ => PassAction::Keep,
    }
}

pub fn optimize(r: &mut Regex) {
    run_pass(r, &mut simplify_brackets);
    loop {
        let mut changed = false;
        changed |= run_pass(r, &mut decat);
        if r.flags.icase {
            changed |= run_pass(r, &mut unfold_icase_chars);
        }
        changed |= run_pass(r, &mut unroll_loops);
        changed |= run_pass(r, &mut promote_1char_loops);
        changed |= run_pass(r, &mut form_literal_bytes);
        changed |= run_pass(r, &mut remove_empties);
        if !changed {
            break;
        }
    }
}