use bstr::ByteVec;
use crate::args::Usage;
/// A single byte value.
///
/// The `FromStr` implementation for this type unescapes its input and
/// requires that the result is exactly one byte long.
#[derive(Debug, Default)]
pub struct OneByte(pub u8);
impl std::str::FromStr for OneByte {
type Err = anyhow::Error;
fn from_str(s: &str) -> anyhow::Result<OneByte> {
let bytes = Vec::unescape_bytes(s);
anyhow::ensure!(
bytes.len() == 1,
"expected exactly one byte, but got {} bytes",
bytes.len(),
);
Ok(OneByte(bytes[0]))
}
}
/// A collection of distinct byte values.
///
/// The `FromStr` implementation for this type rejects duplicate bytes and
/// stores the bytes in ascending order.
#[derive(Debug, Default)]
pub struct ByteSet(pub Vec<u8>);
impl std::str::FromStr for ByteSet {
    type Err = anyhow::Error;

    /// Parses `s` into a sorted set of distinct bytes.
    ///
    /// The input is first passed through `ByteVec::unescape_bytes`. Every
    /// resulting byte becomes a member of the set. An empty input yields an
    /// empty set.
    ///
    /// # Errors
    ///
    /// Returns an error naming the offending byte (as two upper-case hex
    /// digits) if any byte occurs more than once in the unescaped input.
    fn from_str(s: &str) -> anyhow::Result<ByteSet> {
        let mut set = vec![];
        // One flag per possible byte value, used to detect repeats.
        let mut seen = [false; 256];
        for &byte in Vec::unescape_bytes(s).iter() {
            let idx = usize::from(byte);
            anyhow::ensure!(
                !seen[idx],
                // `02X` zero-pads to two hex digits. The previous `2X` padded
                // with a space, printing e.g. `0x A` for byte 0x0A.
                "saw duplicate byte 0x{byte:02X} in '{s}'",
            );
            seen[idx] = true;
            set.push(byte);
        }
        // All bytes are unique here, so an unstable sort gives the same
        // result as a stable one without the extra allocation.
        set.sort_unstable();
        Ok(ByteSet(set))
    }
}
/// Wrapper around `regex_automata::dfa::StartKind`.
///
/// This type has a `FromStr` implementation that accepts the names listed in
/// `StartKind::USAGE` and a `Default` implementation.
#[derive(Debug)]
pub struct StartKind {
/// The wrapped start kind value.
pub kind: regex_automata::dfa::StartKind,
}
impl StartKind {
/// Usage text for the `--start-kind` flag.
///
/// Built from three strings: the flag syntax, a one-line summary of the
/// accepted values, and a longer description of what the flag does.
pub const USAGE: Usage = Usage::new(
"--start-kind <kind>",
"One of: both, unanchored, anchored.",
r#"
Sets the start states supported by a DFA. The default is 'both', but it can
be set to either 'unanchored' or 'anchored'. The benefit of only supporting
unanchored or anchored start states is that it usually leads to a smaller
overall automaton.
"#,
);
}
impl Default for StartKind {
    /// Returns a `StartKind` wrapping `regex_automata::dfa::StartKind::Both`,
    /// which is the default named in the `--start-kind` usage text.
    fn default() -> Self {
        let kind = regex_automata::dfa::StartKind::Both;
        Self { kind }
    }
}
impl std::str::FromStr for StartKind {
    type Err = anyhow::Error;

    /// Parses one of `both`, `unanchored` or `anchored` into a `StartKind`.
    ///
    /// Matching is exact and case-sensitive.
    ///
    /// # Errors
    ///
    /// Returns an error quoting the input when it is not one of the three
    /// recognized names.
    fn from_str(s: &str) -> anyhow::Result<Self> {
        match s {
            "both" => Ok(Self { kind: regex_automata::dfa::StartKind::Both }),
            "unanchored" => Ok(Self {
                kind: regex_automata::dfa::StartKind::Unanchored,
            }),
            "anchored" => Ok(Self {
                kind: regex_automata::dfa::StartKind::Anchored,
            }),
            unk => anyhow::bail!("unrecognized start kind '{unk}'"),
        }
    }
}
/// Wrapper around `regex_automata::MatchKind`.
///
/// This type has a `FromStr` implementation that accepts the names listed in
/// `MatchKind::USAGE` and a `Default` implementation.
#[derive(Debug)]
pub struct MatchKind {
/// The wrapped match kind value.
pub kind: regex_automata::MatchKind,
}
impl MatchKind {
/// Usage text for the `-k`/`--match-kind` flag.
///
/// Built from three strings: the flag syntax, a one-line summary of the
/// accepted values, and a longer description of each match semantics.
pub const USAGE: Usage = Usage::new(
"-k, --match-kind <kind>",
"One of: leftmost-first, all.",
r#"
Selects the match semantics for the regex engine. The choices are
'leftmost-first' (the default) or 'all'.
'leftmost-first' semantics look for the leftmost match, and when there are
multiple leftmost matches, match priority disambiguates them. For example,
in the haystack 'samwise', the regex 'samwise|sam' will match 'samwise' when
using leftmost-first semantics. Similarly, the regex 'sam|samwise' will match
'sam'.
'all' semantics results in including all possible match states in the
underlying automaton. When performing an unanchored leftmost search, this has
the effect of finding the last match, which is usually not what you want.
When performing an anchored leftmost search, it has the effect of finding the
longest possible match, which might be what you want. (So there is no support
for greedy vs non-greedy searching. Everything is greedy.) 'all' is also useful
for overlapping searches, since all matches are reportable in this scheme.
"#,
);
}
impl Default for MatchKind {
    /// Returns a `MatchKind` wrapping
    /// `regex_automata::MatchKind::LeftmostFirst`, which is the default named
    /// in the `--match-kind` usage text.
    fn default() -> Self {
        let kind = regex_automata::MatchKind::LeftmostFirst;
        Self { kind }
    }
}
impl std::str::FromStr for MatchKind {
    type Err = anyhow::Error;

    /// Parses `leftmost-first` or `all` into a `MatchKind`.
    ///
    /// Matching is exact and case-sensitive.
    ///
    /// # Errors
    ///
    /// Returns an error quoting the input when it is neither of the two
    /// recognized names.
    fn from_str(s: &str) -> anyhow::Result<Self> {
        Ok(Self {
            kind: match s {
                "leftmost-first" => regex_automata::MatchKind::LeftmostFirst,
                "all" => regex_automata::MatchKind::All,
                // `bail!` returns early, so no value is needed for this arm.
                unk => anyhow::bail!("unrecognized match kind '{unk}'"),
            },
        })
    }
}
/// Wrapper around `regex_automata::nfa::thompson::WhichCaptures`.
///
/// This type has a `FromStr` implementation that accepts the names listed in
/// `WhichCaptures::USAGE` and a `Default` implementation.
#[derive(Debug)]
pub struct WhichCaptures {
/// The wrapped capture selection value.
pub which: regex_automata::nfa::thompson::WhichCaptures,
}
impl WhichCaptures {
/// Usage text for the `--captures` flag.
///
/// Built from three strings: the flag syntax, a one-line summary of the
/// accepted values, and a longer description of each choice.
pub const USAGE: Usage = Usage::new(
"--captures <which>",
"One of: all, implicit or none.",
r#"
Selects which capture states should be included in the Thompson NFA. The
choices are 'all' (the default), 'implicit' or 'none'.
'all' means that both explicit and implicit capture states are included.
'implicit' means that only implicit capture states are included. That is, the
Thompson NFA will only be able to report the overall match offsets and not the
match offsets of each explicit capture group.
'none' means that no capture states will be included. This is useful when
capture states aren't needed (like when building a DFA) or if they aren't
supported (like when building a reverse NFA).
"#,
);
}
impl Default for WhichCaptures {
    /// Returns a `WhichCaptures` wrapping
    /// `regex_automata::nfa::thompson::WhichCaptures::All`, which is the
    /// default named in the `--captures` usage text.
    fn default() -> Self {
        let which = regex_automata::nfa::thompson::WhichCaptures::All;
        Self { which }
    }
}
impl std::str::FromStr for WhichCaptures {
    type Err = anyhow::Error;

    /// Parses one of `all`, `implicit` or `none` into a `WhichCaptures`.
    ///
    /// Matching is exact and case-sensitive.
    ///
    /// # Errors
    ///
    /// Returns an error quoting the input when it is not one of the three
    /// recognized names.
    fn from_str(s: &str) -> anyhow::Result<Self> {
        // Look the name up first; an unknown name leaves `None`.
        let parsed = match s {
            "all" => Some(regex_automata::nfa::thompson::WhichCaptures::All),
            "implicit" => {
                Some(regex_automata::nfa::thompson::WhichCaptures::Implicit)
            }
            "none" => Some(regex_automata::nfa::thompson::WhichCaptures::None),
            _ => None,
        };
        parsed.map(|which| Self { which }).ok_or_else(|| {
            anyhow::anyhow!("unrecognized captures option '{}'", s)
        })
    }
}