use std::borrow::Borrow;
use {
anyhow::Context,
lexopt::{Arg, Parser},
regex_automata::{nfa::thompson, util::look::LookMatcher},
regex_syntax::hir::Hir,
};
use crate::args::{self, flags, Configurable, Usage};
#[derive(Debug, Default)]
pub struct Config {
thompson: thompson::Config,
}
impl Config {
pub fn thompson(&self) -> anyhow::Result<thompson::Config> {
Ok(self.thompson.clone())
}
pub fn reversed(&self) -> Config {
let thompson = self
.thompson
.clone()
.reverse(true)
.which_captures(thompson::WhichCaptures::None);
Config { thompson }
}
pub fn from_hirs<H: Borrow<Hir>>(
&self,
hirs: &[H],
) -> anyhow::Result<thompson::NFA> {
thompson::Compiler::new()
.configure(self.thompson()?)
.build_many_from_hir(hirs)
.context("failed to compile Thompson NFA")
}
}
impl Configurable for Config {
fn configure(
&mut self,
p: &mut Parser,
arg: &mut Arg,
) -> anyhow::Result<bool> {
match *arg {
Arg::Short('B') | Arg::Long("no-utf8-nfa") => {
self.thompson = self.thompson.clone().utf8(false);
}
Arg::Short('r') | Arg::Long("reverse") => {
self.thompson = self.thompson.clone().reverse(true);
}
Arg::Long("nfa-size-limit") => {
let limit = args::parse_maybe(p, "--nfa-size-limit")?;
self.thompson = self.thompson.clone().nfa_size_limit(limit);
}
Arg::Long("shrink") => {
self.thompson = self.thompson.clone().shrink(true);
}
Arg::Long("captures") => {
let which: flags::WhichCaptures =
args::parse(p, "--captures")?;
self.thompson =
self.thompson.clone().which_captures(which.which);
}
Arg::Long("line-terminator") => {
let byte: flags::OneByte =
args::parse(p, "--line-terminator")?;
let mut lookm = LookMatcher::new();
lookm.set_line_terminator(byte.0);
self.thompson = self.thompson.clone().look_matcher(lookm);
}
_ => return Ok(false),
}
Ok(true)
}
fn usage(&self) -> &[Usage] {
const USAGES: &'static [Usage] = &[
Usage::new(
"-B, --no-utf8-nfa",
"Disable UTF-8 mode for empty matches.",
r#"
Disables UTF-8 mode for empty matches. When this flag is given, empty matches
that split a codepoint are permitted. Otherwise, they are banned.
"#,
),
Usage::new(
"-r, --reverse",
"Build a reverse Thompson NFA.",
r#"
Build a reverse Thompson NFA. The reverse NFA matches the language described
by the corresponding forward NFA, but in reverse. In general, this is done by
reversing the concatenations of the regex and inverting look-around assertions
that depend on the direction of matching. So for example, `^` becomes `$` and
`$` becomes `^`. But `\b` and `\B` remain the same.
Note that at time of writing, using this flag requires also using
--no-captures.
"#,
),
Usage::new(
"--nfa-size-limit",
"Sets a limit on the memory used by an NFA.",
r#"
Sets a limit on the memory used by an NFA, in terms of bytes of heap usage.
This limit is applied during NFA construction. If the limit is exceeded, then
construction will fail.
A special 'none' value disables the limit entirely.
"#,
),
Usage::new(
"--shrink",
"Enables NFA shrinking.",
r#"
This flag enables NFA shrinking. At time of writing, this is an expensive
process that only applies to reverse NFA compilation. The process may get
cheaper or more widely applicable in the future, but it generally results
in making the state graph of large Unicode character classes much smaller.
Moreover, if you're building a fully compiled reverse DFA, the extra time
spent shrinking the NFA can lead to far larger savings in the subsequent DFA
determinization.
"#,
),
flags::WhichCaptures::USAGE,
Usage::new(
"--line-terminator",
"Set the line terminator used by line anchors.",
r#"
Set the line terminator used by line anchors. The line anchors are '(?m:^)' and
'(?m:$)'. By default, they both use '\n' as line terminators for matching
purposes. This option changes the line terminator to any arbitrary byte.
Note that CRLF aware line anchors, that is, '(?Rm:^)' and '(?Rm:$)', are
unaffected by this option. CRLF aware line anchors always use '\r' and '\n'
as line terminators and do not match between a '\r' and '\n'.
"#,
),
];
USAGES
}
}