use std::{
collections::HashSet,
fs::File,
io::{BufRead, Read, Write},
path::{Path, PathBuf},
};
use {
anyhow::Context,
bstr::ByteSlice,
lexopt::{Arg, Parser},
};
use crate::args::{self, Usage};
pub fn run(p: &mut Parser) -> anyhow::Result<()> {
const USAGE: &'static str = r#"
Generate TOML tests from Glenn Fowler's regex test suite.
This corresponds to a very sizeable set of regex tests that were written many
moons ago. They have been tweaked slightly by both Russ Cox and myself (Andrew
Gallant).
This tool is the spiritual successor of some hacky Python scripts. Its input is
a bespoke plain text format matching the original test data, and its output are
TOML files meant to work with the 'regex-test' crate.
Example usage from the root of this repository:
regex-cli generate fowler tests/data/fowler tests/data/fowler/dat/*.dat
See tests/data/fowler/dat/README for more context.
USAGE:
regex-cli generate fowler <outdir> <datfile> ...
outdir should be a path to a directory where you want the TOML files
to be written. datfile should be a Glenn Fowler dat file whose format
is bespoke.
"#;
let mut config = Config::default();
args::configure(p, USAGE, &mut [&mut config])?;
let outdir = config.outdir()?;
let datfiles = config.datfiles()?;
for datfile in datfiles.iter() {
let datfile = Path::new(datfile);
let stem = match datfile.file_stem() {
Some(stem) => stem.to_string_lossy(),
None => anyhow::bail!("{}: has no file stem", datfile.display()),
};
let tomlfile = outdir.join(format!("{stem}.toml"));
let mut rdr = File::open(datfile)
.with_context(|| datfile.display().to_string())?;
let mut wtr = File::create(&tomlfile)
.with_context(|| tomlfile.display().to_string())?;
convert(&stem, &mut rdr, &mut wtr)
.with_context(|| stem.to_string())?;
}
Ok(())
}
#[derive(Debug, Default)]
struct Config {
outdir: Option<PathBuf>,
datfiles: Vec<PathBuf>,
}
impl Config {
fn outdir(&self) -> anyhow::Result<&Path> {
self.outdir
.as_deref()
.ok_or_else(|| anyhow::anyhow!("missing <outdir>"))
}
fn datfiles(&self) -> anyhow::Result<&[PathBuf]> {
anyhow::ensure!(!self.datfiles.is_empty(), "no Fowler datfiles given",);
Ok(&self.datfiles)
}
}
impl args::Configurable for Config {
fn configure(
&mut self,
_: &mut Parser,
arg: &mut Arg,
) -> anyhow::Result<bool> {
match *arg {
Arg::Value(ref mut value) => {
let path = PathBuf::from(std::mem::take(value));
if self.outdir.is_none() {
self.outdir = Some(path);
} else {
self.datfiles.push(path);
}
}
_ => return Ok(false),
}
Ok(true)
}
fn usage(&self) -> &[Usage] {
const USAGES: &'static [Usage] = &[];
USAGES
}
}
fn convert(
mut group_name: &str,
src: &mut dyn Read,
dst: &mut dyn Write,
) -> anyhow::Result<()> {
log::trace!("processing {group_name}");
let src = std::io::BufReader::new(src);
writeln!(
dst,
"\
# !!! DO NOT EDIT !!!
# Automatically generated by 'regex-cli generate fowler'.
# Numbers in the test names correspond to the line number of the test from
# the original dat file.
"
)?;
let mut prev = None;
for (i, result) in src.lines().enumerate() {
let line_number = u64::try_from(i).unwrap().checked_add(1).unwrap();
let line = result.with_context(|| format!("line {line_number}"))?;
if group_name == "repetition" && line.contains("Chris Kuklewicz") {
group_name = "repetition-expensive";
}
if line.trim().is_empty() || line.starts_with('#') {
continue;
}
let dat = match DatTest::parse(prev.as_ref(), line_number, &line)? {
None => continue,
Some(dat) => dat,
};
let toml = TomlTest::from_dat_test(group_name, &dat)?;
writeln!(dst, "{toml}")?;
prev = Some(dat);
}
Ok(())
}
#[derive(Debug)]
struct TomlTest {
group_name: String,
line_number: u64,
regex: String,
haystack: String,
captures: Vec<Option<(u64, u64)>>,
unescape: bool,
case_insensitive: bool,
comment: Option<String>,
}
impl TomlTest {
fn from_dat_test(
group_name: &str,
dat: &DatTest,
) -> anyhow::Result<TomlTest> {
let mut captures = dat.captures.clone();
if !captures.is_empty() {
let numcaps = count_capturing_groups(&dat.regex)?;
for _ in captures.len()..numcaps {
captures.push(None);
}
}
let comment = if dat.re2go {
Some("Test added by RE2/Go project.".to_string())
} else if dat.rust {
Some("Test added by Rust regex project.".to_string())
} else {
None
};
Ok(TomlTest {
group_name: group_name.to_string(),
line_number: dat.line_number,
regex: dat.regex.clone(),
haystack: dat.haystack.clone(),
captures,
unescape: dat.flags.contains(&'$'),
case_insensitive: dat.flags.contains(&'i'),
comment,
})
}
}
impl core::fmt::Display for TomlTest {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
if let Some(ref comment) = self.comment {
writeln!(f, "# {comment}")?;
}
writeln!(f, "[[test]]")?;
writeln!(f, "name = \"{}{}\"", self.group_name, self.line_number)?;
writeln!(f, "regex = '''{}'''", self.regex)?;
writeln!(f, "haystack = '''{}'''", self.haystack)?;
if self.captures.is_empty() {
writeln!(f, "matches = []")?;
} else {
write!(f, "matches = [[")?;
for (i, &group) in self.captures.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
match group {
None => write!(f, "[]")?,
Some((start, end)) => write!(f, "[{start}, {end}]")?,
}
}
writeln!(f, "]]")?;
}
writeln!(f, "match-limit = 1")?;
if self
.captures
.get(0)
.and_then(|&s| s)
.map_or(false, |span| span.0 == 0)
{
writeln!(f, "anchored = true")?;
}
if self.unescape {
writeln!(f, "unescape = true")?;
}
if self.case_insensitive {
writeln!(f, "case-insensitive = true")?;
}
Ok(())
}
}
#[derive(Debug)]
struct DatTest {
line_number: u64,
flags: HashSet<char>,
regex: String,
haystack: String,
captures: Vec<Option<(u64, u64)>>,
re2go: bool,
rust: bool,
}
impl DatTest {
fn parse(
prev: Option<&DatTest>,
line_number: u64,
line: &str,
) -> anyhow::Result<Option<DatTest>> {
let fields: Vec<String> = line
.split('\t')
.map(|f| f.trim().to_string())
.filter(|f| !f.is_empty())
.collect();
if !(4 <= fields.len() && fields.len() <= 5) {
log::trace!(
"skipping {}: too few or too many fields ({})",
line_number,
fields.len(),
);
return Ok(None);
}
let mut flags: HashSet<char> = fields[0].chars().collect();
if !flags.contains(&'E') {
log::trace!("skipping {line_number}: does not contain 'E' flag");
return Ok(None);
}
let mut regex = fields[1].clone();
if regex == "SAME" {
regex = match prev {
Some(test) => test.regex.clone(),
None => anyhow::bail!(
"line {line_number}: wants previous pattern \
but none is available"
),
};
}
let mut haystack = fields[2].clone();
if haystack == "NULL" {
haystack = "".to_string();
}
if regex.chars().any(|c| c.is_control())
|| haystack.chars().any(|c| c.is_control())
{
flags.insert('$');
regex = regex.as_bytes().escape_bytes().to_string();
haystack = haystack.as_bytes().escape_bytes().to_string();
}
let mut captures = vec![];
if fields[3] != "NOMATCH" {
if !fields[3].contains(',') {
log::trace!(
"skipping {line_number}: malformed capturing group"
);
return Ok(None);
}
let noparen = fields[3]
.split(")(")
.map(|x| x.trim_matches(|c| c == '(' || c == ')'));
for group in noparen {
let (start, end) = match group.split_once(',') {
Some((start, end)) => (start.trim(), end.trim()),
None => anyhow::bail!(
"line {}: invalid capturing group '{}' in '{}'",
line_number,
group,
fields[3]
),
};
if start == "?" && end == "?" {
captures.push(None);
} else {
let start = start.parse()?;
let end = end.parse()?;
captures.push(Some((start, end)));
}
}
}
let re2go = fields.len() >= 5 && fields[4] == "RE2/Go";
let rust = fields.len() >= 5 && fields[4] == "Rust";
Ok(Some(DatTest {
line_number,
flags,
regex,
haystack,
captures,
re2go,
rust,
}))
}
}
fn count_capturing_groups(pattern: &str) -> anyhow::Result<usize> {
let ast = regex_syntax::ast::parse::Parser::new()
.parse(pattern)
.with_context(|| format!("failed to parse '{pattern}'"))?;
Ok(1 + count_capturing_groups_ast(&ast))
}
fn count_capturing_groups_ast(ast: ®ex_syntax::ast::Ast) -> usize {
use regex_syntax::ast::Ast;
match *ast {
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
| Ast::ClassUnicode(_)
| Ast::ClassPerl(_)
| Ast::ClassBracketed(_) => 0,
Ast::Repetition(ref rep) => count_capturing_groups_ast(&*rep.ast),
Ast::Group(ref group) => {
let this = if group.is_capturing() { 1 } else { 0 };
this + count_capturing_groups_ast(&*group.ast)
}
Ast::Alternation(ref alt) => {
alt.asts.iter().map(count_capturing_groups_ast).sum()
}
Ast::Concat(ref concat) => {
concat.asts.iter().map(count_capturing_groups_ast).sum()
}
}
}