mod compile;
mod parser;
mod vm;
#[cfg(test)]
mod tests;
use alloc::string::String;
use alloc::vec::Vec;
pub use parser::RegexError;
/// A compiled regular expression together with the metadata produced while
/// compiling it.
///
/// Two programs are kept for the same pattern: one for the UTF-16 entry
/// points and one for the `char`-based entry points.
pub struct Regex {
    /// Program compiled with the pattern's own `unicode` setting; executed by
    /// the `*_u16` methods.
    prog: Vec<vm::Inst>,
    /// Program always compiled in Unicode mode; executed by the `char`/`&str`
    /// methods after the input has been re-encoded as UTF-16.
    scalar_prog: Vec<vm::Inst>,
    /// Group count reported by the compiler for `prog`.
    group_count: usize,
    /// Named groups as returned by the parser — presumably
    /// `(group index, name)` pairs; confirm against `parser::parse`.
    group_names: Vec<(usize, String)>,
    /// Flags the expression was created with.
    flags: Flags,
}
/// Set of flags attached to a [`Regex`], one boolean per recognised flag
/// character.
///
/// The default value has every flag cleared. The type is a small `Copy`
/// value, so it also derives `Debug`, `PartialEq` and `Eq` to make it usable
/// in assertions and diagnostics.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct Flags {
    /// Set by the `i` flag character.
    pub ignore_case: bool,
    /// Set by the `g` flag character; `Regex::replace` replaces every match
    /// instead of only the first when this is set.
    pub global: bool,
    /// Set by the `m` flag character.
    pub multiline: bool,
    /// Set by the `s` flag character.
    pub dotall: bool,
    /// Set by the `y` flag character; a sticky search only attempts a match
    /// at the requested start position.
    pub sticky: bool,
    /// Set by the `u` flag character; selects Unicode mode for parsing and
    /// compiling the pattern.
    pub unicode: bool,
}
impl Flags {
    /// Parses a flag string such as `"gi"` into a [`Flags`] value.
    ///
    /// Recognised characters are `i`, `g`, `m`, `s`, `y` and `u`. The `d`
    /// character is accepted but has no effect. Repeating a flag is allowed
    /// and simply sets it again.
    ///
    /// # Errors
    ///
    /// Returns a [`RegexError`] naming the offending character as soon as an
    /// unrecognised flag is encountered.
    pub fn parse(s: &str) -> Result<Flags, RegexError> {
        let mut parsed = Flags::default();
        for flag in s.chars() {
            // Pick the field this character switches on.
            let slot = match flag {
                'i' => &mut parsed.ignore_case,
                'g' => &mut parsed.global,
                'm' => &mut parsed.multiline,
                's' => &mut parsed.dotall,
                'y' => &mut parsed.sticky,
                'u' => &mut parsed.unicode,
                // Accepted for compatibility, but nothing is recorded for it.
                'd' => continue,
                other => {
                    return Err(RegexError::new(alloc::format!("unknown flag `{other}`")));
                }
            };
            *slot = true;
        }
        Ok(parsed)
    }
}
/// Spans recorded for one successful match.
///
/// Offsets are expressed in whatever unit the producing method works in:
/// `char` indices for the `char`/`&str` entry points, UTF-16 code-unit
/// indices for the `*_u16` entry points.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Captures {
    /// One `(start, end)` entry per group; index 0 is the whole match and an
    /// entry is `None` when no span was recorded for that group.
    pub groups: Vec<Option<(usize, usize)>>,
}

impl Captures {
    /// Returns the `(start, end)` span of the whole match (group 0).
    ///
    /// # Panics
    ///
    /// Panics if `groups` is empty or its first entry is `None`, which does
    /// not happen for values produced by a successful match.
    #[must_use]
    pub fn whole(&self) -> (usize, usize) {
        let span = self.groups[0];
        span.expect("group 0 is always set on a successful match")
    }

    /// Returns the span of group `i`, or `None` when the group has no
    /// recorded span or `i` is out of range.
    #[must_use]
    pub fn group(&self, i: usize) -> Option<(usize, usize)> {
        match self.groups.get(i) {
            Some(&span) => span,
            None => None,
        }
    }
}
impl Regex {
    /// Compiles `pattern` using the flag characters in `flags`.
    ///
    /// Two programs are built: one that honours the requested `unicode`
    /// setting (used by the UTF-16 entry points) and one that is always
    /// compiled in Unicode mode (used by the `char`/`&str` entry points).
    ///
    /// # Errors
    ///
    /// Returns a [`RegexError`] when the flag string, the parse or either
    /// compilation fails.
    pub fn new(pattern: &str, flags: &str) -> Result<Regex, RegexError> {
        let flags = Flags::parse(flags)?;
        let (ast, _, group_names) = parser::parse(pattern, flags.unicode)?;
        let (prog, group_count) = compile::compile(&ast, &group_names, flags.unicode)?;
        let scalar_prog = if flags.unicode {
            let (sp, _) = compile::compile(&ast, &group_names, true)?;
            sp
        } else {
            // NOTE(review): the pattern is parsed a second time in Unicode
            // mode here, so construction fails if that parse rejects a
            // pattern the non-Unicode parse accepted — confirm this is
            // intended.
            let (ast_u, _, names_u) = parser::parse(pattern, true)?;
            let (sp, _) = compile::compile(&ast_u, &names_u, true)?;
            sp
        };
        Ok(Regex {
            prog,
            scalar_prog,
            group_count,
            group_names,
            flags,
        })
    }

    /// Returns the group count reported by the compiler.
    #[must_use]
    pub fn group_count(&self) -> usize {
        self.group_count
    }

    /// Returns the named-group table produced by the parser.
    #[must_use]
    pub fn group_names(&self) -> &[(usize, String)] {
        &self.group_names
    }

    /// Returns the flags this expression was created with.
    #[must_use]
    pub fn flags(&self) -> Flags {
        self.flags
    }

    /// Returns `true` when a search of `text` starting at index 0 finds a
    /// match.
    #[must_use]
    pub fn is_match(&self, text: &str) -> bool {
        self.captures_from(text, 0).is_some()
    }

    /// Searches `text` from `start` and returns all group spans.
    ///
    /// `start` and the returned offsets are `char` indices, not byte
    /// offsets. A `start` past the end of the text is treated as the end.
    #[must_use]
    pub fn captures_from(&self, text: &str, start: usize) -> Option<Captures> {
        let chars: Vec<char> = text.chars().collect();
        self.captures_at(&chars, start)
    }

    /// Like [`Regex::captures_from`], but returns only the span of the whole
    /// match.
    #[must_use]
    pub fn find_from(&self, text: &str, start: usize) -> Option<(usize, usize)> {
        self.captures_from(text, start).map(|c| c.whole())
    }

    /// Searches an already-decoded `char` slice from `start`; offsets are
    /// indices into `chars`.
    #[must_use]
    pub fn captures_in(&self, chars: &[char], start: usize) -> Option<Captures> {
        self.captures_at(chars, start)
    }

    /// Like [`Regex::captures_in`], but returns only the span of the whole
    /// match.
    #[must_use]
    pub fn find_in(&self, chars: &[char], start: usize) -> Option<(usize, usize)> {
        self.captures_at(chars, start).map(|c| c.whole())
    }

    /// Searches a UTF-16 code-unit slice from `start`; offsets are indices
    /// into `units`. Returns `None` when `start` is past the end of `units`.
    #[must_use]
    pub fn captures_in_u16(&self, units: &[u16], start: usize) -> Option<Captures> {
        self.captures_at_u16(units, start)
    }

    /// Like [`Regex::captures_in_u16`], but returns only the span of the
    /// whole match.
    #[must_use]
    pub fn find_in_u16(&self, units: &[u16], start: usize) -> Option<(usize, usize)> {
        self.captures_at_u16(units, start).map(|c| c.whole())
    }

    /// Runs the primary program over `units` with the expression's own flags.
    fn captures_at_u16(&self, units: &[u16], start: usize) -> Option<Captures> {
        self.scan(&self.prog, units, start, self.flags)
    }

    /// Encodes `chars` as UTF-16 and records, for every `char` index, the
    /// code-unit offset at which that `char` starts. The table has one extra
    /// trailing entry equal to the total number of code units.
    fn utf16_with_offsets(chars: &[char]) -> (Vec<u16>, Vec<usize>) {
        let mut units: Vec<u16> = Vec::with_capacity(chars.len());
        let mut char_to_unit: Vec<usize> = Vec::with_capacity(chars.len() + 1);
        let mut buf = [0u16; 2];
        for &c in chars {
            char_to_unit.push(units.len());
            units.extend_from_slice(c.encode_utf16(&mut buf));
        }
        char_to_unit.push(units.len());
        (units, char_to_unit)
    }

    /// Matches against a pre-encoded input and converts the resulting
    /// code-unit offsets back to `char` indices.
    ///
    /// `units` and `char_to_unit` must come from the same
    /// [`Regex::utf16_with_offsets`] call; `start` is a `char` index.
    fn captures_in_units(
        &self,
        units: &[u16],
        char_to_unit: &[usize],
        start: usize,
    ) -> Option<Captures> {
        // A start beyond the last char is clamped to the end of the input.
        let unit_start = char_to_unit.get(start).copied().unwrap_or(units.len());
        // The scalar program is always compiled in Unicode mode, so it is
        // run with the unicode flag forced on as well.
        let mut adapter_flags = self.flags;
        adapter_flags.unicode = true;
        let caps = self.scan(&self.scalar_prog, units, unit_start, adapter_flags)?;
        let to_char = |u: usize| -> usize {
            match char_to_unit.binary_search(&u) {
                Ok(i) => i,
                // Offset falls inside a surrogate pair: round down to the
                // char that contains it.
                Err(i) => i.saturating_sub(1),
            }
        };
        let groups = caps
            .groups
            .into_iter()
            .map(|g| g.map(|(s, e)| (to_char(s), to_char(e))))
            .collect();
        Some(Captures { groups })
    }

    /// `char`-indexed search: encodes the input as UTF-16, runs the scalar
    /// program and maps the offsets back.
    fn captures_at(&self, chars: &[char], start: usize) -> Option<Captures> {
        let (units, char_to_unit) = Self::utf16_with_offsets(chars);
        self.captures_in_units(&units, &char_to_unit, start)
    }

    /// Tries `prog` at successive start positions and returns the first
    /// match.
    ///
    /// With `flags.sticky` only `start` itself is tried; otherwise every
    /// position from `start` through `units.len()` is tried in order. One
    /// step counter is shared by all attempts of a single call, together
    /// with the budget computed from the input length.
    fn scan(
        &self,
        prog: &[vm::Inst],
        units: &[u16],
        start: usize,
        flags: Flags,
    ) -> Option<Captures> {
        // No match can begin past the end of the input. Without this guard
        // the sticky case would still run the VM once at an out-of-range
        // position, whereas the non-sticky case already yields `None`.
        if start > units.len() {
            return None;
        }
        let last = if flags.sticky { start } else { units.len() };
        let steps = core::cell::Cell::new(0u64);
        let budget = vm::budget_for(units.len());
        (start..=last)
            .find_map(|s| vm::run_shared(prog, units, s, self.group_count, flags, &steps, budget))
            .map(|groups| Captures { groups })
    }

    /// Returns `text` with matches replaced by the expansion of
    /// `replacement`.
    ///
    /// Only the first match is replaced unless the `global` flag is set. In
    /// `replacement`, `$&` stands for the whole match, `$1`–`$9` for the
    /// corresponding group and `$$` for a literal `$`. After an empty match
    /// the following char is copied through unchanged and the search resumes
    /// one char later, so the loop always makes progress.
    #[must_use]
    pub fn replace(&self, text: &str, replacement: &str) -> String {
        let chars: Vec<char> = text.chars().collect();
        // Encode once up front; re-encoding the whole input for every match
        // would make global replaces quadratic in the input length.
        let (units, char_to_unit) = Self::utf16_with_offsets(&chars);
        let mut out = String::with_capacity(text.len());
        let mut pos = 0;
        while pos <= chars.len() {
            let Some(caps) = self.captures_in_units(&units, &char_to_unit, pos) else {
                break;
            };
            let (ms, me) = caps.whole();
            out.extend(&chars[pos..ms]);
            expand_replacement(replacement, &chars, &caps, &mut out);
            if me > ms {
                pos = me;
            } else {
                if me < chars.len() {
                    out.push(chars[me]);
                }
                pos = me + 1;
            }
            if !self.flags.global {
                break;
            }
        }
        // `pos` can be one past the end after an empty match at the end.
        out.extend(&chars[pos.min(chars.len())..]);
        out
    }
}
/// Appends the expansion of `template` for one match to `out`.
///
/// Recognised sequences:
/// - `$&` — the text of the whole match;
/// - `$1`–`$9` — the text of that group, or nothing when the group has no
///   recorded span or does not exist;
/// - `$$` — a literal `$`.
///
/// Any other `$`, including one at the very end of the template, is copied
/// through unchanged. `chars` is the searched input that the spans in `caps`
/// index into.
fn expand_replacement(template: &str, chars: &[char], caps: &Captures, out: &mut String) {
    let mut rest = template.chars().peekable();
    while let Some(current) = rest.next() {
        if current != '$' {
            out.push(current);
            continue;
        }
        match rest.peek().copied() {
            Some('&') => {
                rest.next();
                let (from, to) = caps.whole();
                out.extend(chars[from..to].iter());
            }
            Some(digit @ '1'..='9') => {
                rest.next();
                let index = usize::from(digit as u8 - b'0');
                if let Some((from, to)) = caps.group(index) {
                    out.extend(chars[from..to].iter());
                }
            }
            Some('$') => {
                rest.next();
                out.push('$');
            }
            // Not a recognised sequence: keep the `$` and let the following
            // char (if any) be handled as ordinary text on the next turn.
            _ => out.push('$'),
        }
    }
}