regex-cli 0.2.3

A command line tool for debugging, ad hoc benchmarking and generating regular expressions.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
use std::{
    collections::HashSet,
    fs::File,
    io::{BufRead, Read, Write},
    path::{Path, PathBuf},
};

use {
    anyhow::Context,
    bstr::ByteSlice,
    lexopt::{Arg, Parser},
};

use crate::args::{self, Usage};

pub fn run(p: &mut Parser) -> anyhow::Result<()> {
    const USAGE: &'static str = r#"
Generate TOML tests from Glenn Fowler's regex test suite.

This corresponds to a very sizeable set of regex tests that were written many
moons ago. They have been tweaked slightly by both Russ Cox and myself (Andrew
Gallant).

This tool is the spiritual successor of some hacky Python scripts. Its input is
a bespoke plain text format matching the original test data, and its output are
TOML files meant to work with the 'regex-test' crate.

Example usage from the root of this repository:

    regex-cli generate fowler tests/data/fowler tests/data/fowler/dat/*.dat

See tests/data/fowler/dat/README for more context.

USAGE:
    regex-cli generate fowler <outdir> <datfile> ...

    outdir should be a path to a directory where you want the TOML files
    to be written. datfile should be a Glenn Fowler dat file whose format
    is bespoke.
"#;

    let mut config = Config::default();
    args::configure(p, USAGE, &mut [&mut config])?;

    let outdir = config.outdir()?;
    let datfiles = config.datfiles()?;
    for datfile in datfiles.iter() {
        let datfile = Path::new(datfile);
        let stem = match datfile.file_stem() {
            Some(stem) => stem.to_string_lossy(),
            None => anyhow::bail!("{}: has no file stem", datfile.display()),
        };
        let tomlfile = outdir.join(format!("{stem}.toml"));

        let mut rdr = File::open(datfile)
            .with_context(|| datfile.display().to_string())?;
        let mut wtr = File::create(&tomlfile)
            .with_context(|| tomlfile.display().to_string())?;
        convert(&stem, &mut rdr, &mut wtr)
            .with_context(|| stem.to_string())?;
    }
    Ok(())
}

#[derive(Debug, Default)]
struct Config {
    outdir: Option<PathBuf>,
    datfiles: Vec<PathBuf>,
}

impl Config {
    fn outdir(&self) -> anyhow::Result<&Path> {
        self.outdir
            .as_deref()
            .ok_or_else(|| anyhow::anyhow!("missing <outdir>"))
    }

    fn datfiles(&self) -> anyhow::Result<&[PathBuf]> {
        anyhow::ensure!(!self.datfiles.is_empty(), "no Fowler datfiles given",);
        Ok(&self.datfiles)
    }
}

impl args::Configurable for Config {
    fn configure(
        &mut self,
        _: &mut Parser,
        arg: &mut Arg,
    ) -> anyhow::Result<bool> {
        match *arg {
            Arg::Value(ref mut value) => {
                let path = PathBuf::from(std::mem::take(value));
                if self.outdir.is_none() {
                    self.outdir = Some(path);
                } else {
                    self.datfiles.push(path);
                }
            }
            _ => return Ok(false),
        }
        Ok(true)
    }

    fn usage(&self) -> &[Usage] {
        const USAGES: &'static [Usage] = &[];
        USAGES
    }
}

fn convert(
    mut group_name: &str,
    src: &mut dyn Read,
    dst: &mut dyn Write,
) -> anyhow::Result<()> {
    log::trace!("processing {group_name}");
    let src = std::io::BufReader::new(src);

    writeln!(
        dst,
        "\
# !!! DO NOT EDIT !!!
# Automatically generated by 'regex-cli generate fowler'.
# Numbers in the test names correspond to the line number of the test from
# the original dat file.
"
    )?;

    let mut prev = None;
    for (i, result) in src.lines().enumerate() {
        // Every usize can fit into a u64... Right?
        let line_number = u64::try_from(i).unwrap().checked_add(1).unwrap();
        let line = result.with_context(|| format!("line {line_number}"))?;
        // The last group of tests in 'repetition' take quite a lot of time
        // when using them to build and minimize a DFA. So we tag them with
        // 'expensive' so that we can skip those tests when we need to minimize
        // a DFA.
        if group_name == "repetition" && line.contains("Chris Kuklewicz") {
            group_name = "repetition-expensive";
        }
        if line.trim().is_empty() || line.starts_with('#') {
            // Too noisy to log that we're skipping an empty or commented test.
            continue;
        }
        let dat = match DatTest::parse(prev.as_ref(), line_number, &line)? {
            None => continue,
            Some(dat) => dat,
        };
        let toml = TomlTest::from_dat_test(group_name, &dat)?;
        writeln!(dst, "{toml}")?;
        prev = Some(dat);
    }
    Ok(())
}

#[derive(Debug)]
struct TomlTest {
    group_name: String,
    line_number: u64,
    regex: String,
    haystack: String,
    captures: Vec<Option<(u64, u64)>>,
    unescape: bool,
    case_insensitive: bool,
    comment: Option<String>,
}

impl TomlTest {
    fn from_dat_test(
        group_name: &str,
        dat: &DatTest,
    ) -> anyhow::Result<TomlTest> {
        let mut captures = dat.captures.clone();
        if !captures.is_empty() {
            // Many of the Fowler tests don't actually list out every capturing
            // group match, and they instead stop once all remaining capturing
            // groups are empty. In effect, it makes writing tests terser,
            // but adds more implicitness. The TOML test suite does not make
            // this trade off (to this extent anyway), so it really wants all
            // capturing groups...
            //
            // So what we do here is look for the number of groups in the
            // pattern and then just pad out the capture matches with None
            // values to make the number of capture matches equal to what we
            // would expect from the pattern. (We actually parse the regex to
            // determine this.)
            //
            // Sadly, this doesn't work for a small subset of tests that
            // actually have more capturing group MATCHES than what is listed
            // explicitly in the test. Instead, the test includes an 'nmatch'
            // instruction that instructs the test runner to only consider the
            // first N capturing groups. Our test runner has no such option...
            // To fix that, I rewrote the tests to use non-capturing groups in
            // order to match the expected number of capture matches.
            let numcaps = count_capturing_groups(&dat.regex)?;
            for _ in captures.len()..numcaps {
                captures.push(None);
            }
        }

        let comment = if dat.re2go {
            Some("Test added by RE2/Go project.".to_string())
        } else if dat.rust {
            Some("Test added by Rust regex project.".to_string())
        } else {
            None
        };
        Ok(TomlTest {
            group_name: group_name.to_string(),
            line_number: dat.line_number,
            regex: dat.regex.clone(),
            haystack: dat.haystack.clone(),
            captures,
            unescape: dat.flags.contains(&'$'),
            case_insensitive: dat.flags.contains(&'i'),
            comment,
        })
    }
}

impl core::fmt::Display for TomlTest {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        if let Some(ref comment) = self.comment {
            writeln!(f, "# {comment}")?;
        }
        writeln!(f, "[[test]]")?;
        writeln!(f, "name = \"{}{}\"", self.group_name, self.line_number)?;
        writeln!(f, "regex = '''{}'''", self.regex)?;
        writeln!(f, "haystack = '''{}'''", self.haystack)?;
        if self.captures.is_empty() {
            writeln!(f, "matches = []")?;
        } else {
            write!(f, "matches = [[")?;
            for (i, &group) in self.captures.iter().enumerate() {
                if i > 0 {
                    write!(f, ", ")?;
                }
                match group {
                    None => write!(f, "[]")?,
                    Some((start, end)) => write!(f, "[{start}, {end}]")?,
                }
            }
            writeln!(f, "]]")?;
        }
        writeln!(f, "match-limit = 1")?;
        // If the match starts at 0, then set anchored=true. This gives us more
        // coverage on the anchored option and lets regex engines like the
        // one-pass DFA participate a bit more in the test suite.
        if self
            .captures
            .get(0)
            .and_then(|&s| s)
            .map_or(false, |span| span.0 == 0)
        {
            writeln!(f, "anchored = true")?;
        }
        if self.unescape {
            writeln!(f, "unescape = true")?;
        }
        if self.case_insensitive {
            writeln!(f, "case-insensitive = true")?;
        }
        Ok(())
    }
}

#[derive(Debug)]
struct DatTest {
    line_number: u64,
    flags: HashSet<char>,
    regex: String,
    haystack: String,
    captures: Vec<Option<(u64, u64)>>,
    re2go: bool,
    rust: bool,
}

impl DatTest {
    fn parse(
        prev: Option<&DatTest>,
        line_number: u64,
        line: &str,
    ) -> anyhow::Result<Option<DatTest>> {
        let fields: Vec<String> = line
            .split('\t')
            .map(|f| f.trim().to_string())
            .filter(|f| !f.is_empty())
            .collect();
        if !(4 <= fields.len() && fields.len() <= 5) {
            log::trace!(
                "skipping {}: too few or too many fields ({})",
                line_number,
                fields.len(),
            );
            return Ok(None);
        }

        // First field contains terse one-letter flags.
        let mut flags: HashSet<char> = fields[0].chars().collect();
        if !flags.contains(&'E') {
            log::trace!("skipping {line_number}: does not contain 'E' flag");
            return Ok(None);
        }

        // Second field contains the regex pattern or 'SAME' if it's the
        // same as the regex in the previous test.
        let mut regex = fields[1].clone();
        if regex == "SAME" {
            regex = match prev {
                Some(test) => test.regex.clone(),
                None => anyhow::bail!(
                    "line {line_number}: wants previous pattern \
                     but none is available"
                ),
            };
        }

        // Third field contains the text to search or 'NULL'.
        let mut haystack = fields[2].clone();
        if haystack == "NULL" {
            haystack = "".to_string();
        }

        // Some tests have literal control characters in the regex/input
        // instead of using escapes. TOML freaks out at this, so we detect the
        // case, escape them and add '$' to our flags. (Which will ultimately
        // instruct the test harness to unescape the input.)
        if regex.chars().any(|c| c.is_control())
            || haystack.chars().any(|c| c.is_control())
        {
            flags.insert('$');
            regex = regex.as_bytes().escape_bytes().to_string();
            haystack = haystack.as_bytes().escape_bytes().to_string();
        }

        // Fourth field contains the capturing groups, or 'NOMATCH' or an
        // error.
        let mut captures = vec![];
        if fields[3] != "NOMATCH" {
            // Some tests check for a compilation error to occur, but we skip
            // these for now. We might consider adding them manually, or better
            // yet, just adding support for them here.
            if !fields[3].contains(',') {
                log::trace!(
                    "skipping {line_number}: malformed capturing group"
                );
                return Ok(None);
            }
            let noparen = fields[3]
                .split(")(")
                .map(|x| x.trim_matches(|c| c == '(' || c == ')'));
            for group in noparen {
                let (start, end) = match group.split_once(',') {
                    Some((start, end)) => (start.trim(), end.trim()),
                    None => anyhow::bail!(
                        "line {}: invalid capturing group '{}' in '{}'",
                        line_number,
                        group,
                        fields[3]
                    ),
                };
                if start == "?" && end == "?" {
                    captures.push(None);
                } else {
                    let start = start.parse()?;
                    let end = end.parse()?;
                    captures.push(Some((start, end)));
                }
            }
        }

        // The fifth field is optional and contains some notes. Currently, this
        // is used to indicate tests added or modified by particular regex
        // implementations.
        let re2go = fields.len() >= 5 && fields[4] == "RE2/Go";
        let rust = fields.len() >= 5 && fields[4] == "Rust";

        Ok(Some(DatTest {
            line_number,
            flags,
            regex,
            haystack,
            captures,
            re2go,
            rust,
        }))
    }
}

fn count_capturing_groups(pattern: &str) -> anyhow::Result<usize> {
    let ast = regex_syntax::ast::parse::Parser::new()
        .parse(pattern)
        .with_context(|| format!("failed to parse '{pattern}'"))?;
    // We add 1 to account for the capturing group for the entire
    // pattern.
    Ok(1 + count_capturing_groups_ast(&ast))
}

fn count_capturing_groups_ast(ast: &regex_syntax::ast::Ast) -> usize {
    use regex_syntax::ast::Ast;
    match *ast {
        Ast::Empty(_)
        | Ast::Flags(_)
        | Ast::Literal(_)
        | Ast::Dot(_)
        | Ast::Assertion(_)
        | Ast::ClassUnicode(_)
        | Ast::ClassPerl(_)
        | Ast::ClassBracketed(_) => 0,
        Ast::Repetition(ref rep) => count_capturing_groups_ast(&*rep.ast),
        Ast::Group(ref group) => {
            let this = if group.is_capturing() { 1 } else { 0 };
            this + count_capturing_groups_ast(&*group.ast)
        }
        Ast::Alternation(ref alt) => {
            alt.asts.iter().map(count_capturing_groups_ast).sum()
        }
        Ast::Concat(ref concat) => {
            concat.asts.iter().map(count_capturing_groups_ast).sum()
        }
    }
}