use std::collections::{HashMap, HashSet};
use crate::emacs_core::{emacs_char, syntax::SyntaxClass};
use smallvec::SmallVec;
const INLINE_REGEX_REGISTERS: usize = 8;
type RegisterScratch = SmallVec<[Option<usize>; INLINE_REGEX_REGISTERS]>;
type SavedRegisters = SmallVec<[(usize, i64, i64); INLINE_REGEX_REGISTERS]>;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub(crate) enum RegexOp {
NoOp = 0,
Succeed = 1,
Exactn = 2,
AnyChar = 3,
Charset = 4,
CharsetNot = 5,
StartMemory = 6,
StopMemory = 7,
Duplicate = 8,
BegLine = 9,
EndLine = 10,
BegBuf = 11,
EndBuf = 12,
Jump = 13,
OnFailureJump = 14,
OnFailureKeepStringJump = 15,
OnFailureJumpLoop = 16,
OnFailureJumpNastyloop = 17,
OnFailureJumpSmart = 18,
SucceedN = 19,
JumpN = 20,
SetNumberAt = 21,
WordBeg = 22,
WordEnd = 23,
WordBound = 24,
NotWordBound = 25,
SymBeg = 26,
SymEnd = 27,
SyntaxSpec = 28,
NotSyntaxSpec = 29,
AtDot = 30,
CategorySpec = 31,
NotCategorySpec = 32,
}
impl RegexOp {
fn from_byte(b: u8) -> Option<Self> {
if b <= 32 {
Some(unsafe { std::mem::transmute(b) })
} else {
None
}
}
}
#[derive(Clone)]
pub(crate) struct CompiledPattern {
pub buffer: Vec<u8>,
pub re_nsub: usize,
pub fastmap: [bool; 256],
pub fastmap_accurate: bool,
pub posix: bool,
pub multibyte: bool,
pub target_multibyte: bool,
pub can_be_null: bool,
pub translate: Option<Vec<char>>,
pub multibyte_charsets: HashMap<usize, Vec<(char, char)>>,
pub charset_class_bits: HashMap<usize, u8>,
}
pub const CHARSET_CLASS_BIT_WORD: u8 = 1 << 0;
pub const CHARSET_CLASS_BIT_SPACE: u8 = 1 << 1;
impl CompiledPattern {
pub fn new() -> Self {
Self {
buffer: Vec::with_capacity(256),
re_nsub: 0,
fastmap: [false; 256],
fastmap_accurate: false,
posix: false,
multibyte: true,
target_multibyte: true,
can_be_null: false,
translate: None,
multibyte_charsets: HashMap::new(),
charset_class_bits: HashMap::new(),
}
}
}
#[derive(Clone, Debug)]
pub(crate) struct MatchRegisters {
pub start: Vec<i64>,
pub end: Vec<i64>,
}
impl MatchRegisters {
pub fn new(num_groups: usize) -> Self {
Self {
start: vec![-1; num_groups],
end: vec![-1; num_groups],
}
}
pub fn num_regs(&self) -> usize {
self.start.len()
}
}
#[derive(Clone, Debug)]
struct FailurePoint {
pattern_pos: usize,
string_pos: Option<usize>,
saved_registers: SavedRegisters,
saved_counters: HashMap<usize, i16>,
}
fn store_number(buf: &mut [u8], pos: usize, number: i16) {
let bytes = number.to_le_bytes();
buf[pos] = bytes[0];
buf[pos + 1] = bytes[1];
}
fn extract_number(buf: &[u8], pos: usize) -> i16 {
i16::from_le_bytes([buf[pos], buf[pos + 1]])
}
fn get_counter(counters: &HashMap<usize, i16>, bytecode: &[u8], pos: usize) -> i16 {
counters
.get(&pos)
.copied()
.unwrap_or_else(|| extract_number(bytecode, pos))
}
fn set_counter(counters: &mut HashMap<usize, i16>, pos: usize, val: i16) {
counters.insert(pos, val);
}
#[derive(Debug, Clone)]
pub(crate) struct RegexCompileError {
pub message: String,
}
impl std::fmt::Display for RegexCompileError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Invalid regexp: {}", self.message)
}
}
#[derive(Clone, Debug)]
struct CompileStackEntry {
begalt_offset: usize,
fixup_alt_jump: Option<usize>,
laststart_offset: Option<usize>,
regnum: usize,
assigned_group: Option<usize>,
group_bytecode_start: usize,
}
pub(crate) fn regex_compile(
pattern: &str,
posix: bool,
case_fold: bool,
) -> Result<CompiledPattern, RegexCompileError> {
let pattern = crate::heap_types::LispString::from_utf8(pattern);
regex_compile_lisp(&pattern, posix, case_fold)
}
pub(crate) fn regex_compile_lisp(
pattern: &crate::heap_types::LispString,
posix: bool,
case_fold: bool,
) -> Result<CompiledPattern, RegexCompileError> {
let mut buf = CompiledPattern::new();
buf.posix = posix;
buf.multibyte = pattern.is_multibyte();
buf.target_multibyte = pattern.is_multibyte();
if case_fold {
let mut table = Vec::with_capacity(256);
for i in 0..256u32 {
let c = char::from_u32(i).expect("0..=255 must be valid Unicode scalars");
table.push(c.to_lowercase().next().unwrap_or(c));
}
buf.translate = Some(table);
}
let pattern_bytes = pattern.as_bytes();
let plen = pattern_bytes.len();
let mut p = 0;
let mut compile_stack: Vec<CompileStackEntry> = Vec::new();
let mut regnum: usize = 0;
let mut begalt_offset: usize = 0; let mut pending_exact: Option<usize> = None; let mut laststart: Option<usize> = None; let mut laststart_is_group = false; let mut fixup_alt_jump: Option<usize> = None;
macro_rules! emit {
($byte:expr) => {
buf.buffer.push($byte);
};
}
macro_rules! emit_op {
($op:expr) => {
buf.buffer.push($op as u8);
};
}
macro_rules! bpos {
() => {
buf.buffer.len()
};
}
#[allow(unused_macros)]
macro_rules! pat_fetch {
() => {{
if p >= plen {
return Err(RegexCompileError {
message: "premature end of pattern".to_string(),
});
}
let c = pattern_bytes[p];
p += 1;
c
}};
}
while p < plen {
let c = pattern_bytes[p];
p += 1;
match c {
b'^' => {
laststart = None;
pending_exact = None;
emit_op!(RegexOp::BegLine);
}
b'$' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::EndLine);
}
b'.' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::AnyChar);
}
b'*' | b'+' | b'?' => {
let Some(mut last) = laststart else {
goto_normal_char(
c as u32,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
continue;
};
let last_is_group = laststart_is_group;
if !last_is_group && buf.buffer[last] == RegexOp::Exactn as u8 {
let count_pos = last + 1;
let count = buf.buffer[count_pos] as usize;
let exact_start = count_pos + 1;
let exact_end = exact_start + count;
let exact_bytes = &buf.buffer[exact_start..exact_end];
let last_char_start = if buf.multibyte {
let mut rel = 0;
let mut previous = 0;
let mut chars = 0;
while rel < exact_bytes.len() {
previous = rel;
let (_, len) = emacs_char::string_char(&exact_bytes[rel..]);
rel += len;
chars += 1;
}
if chars > 1 { Some(previous) } else { None }
} else if count > 1 {
Some(count - 1)
} else {
None
};
if let Some(split_start) = last_char_start {
let split_bytes = exact_bytes[split_start..].to_vec();
buf.buffer.truncate(exact_start + split_start);
buf.buffer[count_pos] = split_start as u8;
last = buf.buffer.len();
buf.buffer.push(RegexOp::Exactn as u8);
buf.buffer.push(split_bytes.len() as u8);
buf.buffer.extend_from_slice(&split_bytes);
}
}
let greedy = if p < plen && pattern_bytes[p] == b'?' {
p += 1; false
} else {
true
};
compile_repetition(c, greedy, posix, last, &mut buf)?;
laststart = None; laststart_is_group = false;
pending_exact = None;
}
b'[' => {
laststart = Some(bpos!());
pending_exact = None;
let pattern_multibyte = buf.multibyte;
compile_charset(
pattern_bytes,
&mut p,
&mut buf,
case_fold,
pattern_multibyte,
)?;
}
b'\\' => {
if p >= plen {
return Err(RegexCompileError {
message: "trailing backslash".to_string(),
});
}
let c2 = pattern_bytes[p];
p += 1;
match c2 {
b'(' => {
let shy = p + 1 < plen
&& pattern_bytes[p] == b'?'
&& pattern_bytes[p + 1] == b':';
if shy {
p += 2; }
let mut explicit_group: Option<usize> = None;
if !shy && p < plen && pattern_bytes[p] == b'?' {
let saved_p = p;
p += 1; let num_start = p;
while p < plen && pattern_bytes[p].is_ascii_digit() {
p += 1;
}
if p > num_start && p < plen && pattern_bytes[p] == b':' {
let num_str = std::str::from_utf8(&pattern_bytes[num_start..p])
.unwrap_or("0");
if let Ok(n) = num_str.parse::<usize>() {
explicit_group = Some(n);
p += 1; }
}
if explicit_group.is_none() {
p = saved_p; }
}
let is_shy = shy;
let group_start = bpos!();
let assigned = if let Some(n) = explicit_group {
Some(n)
} else if !is_shy {
Some(regnum + 1)
} else {
None
};
compile_stack.push(CompileStackEntry {
begalt_offset,
fixup_alt_jump,
laststart_offset: laststart,
regnum,
assigned_group: assigned,
group_bytecode_start: group_start,
});
if let Some(n) = explicit_group {
while buf.re_nsub < n {
buf.re_nsub += 1;
}
regnum = n;
emit_op!(RegexOp::StartMemory);
emit!(n as u8);
} else if !is_shy {
regnum += 1;
buf.re_nsub += 1;
emit_op!(RegexOp::StartMemory);
emit!(regnum as u8);
}
begalt_offset = bpos!();
laststart = None;
fixup_alt_jump = None;
pending_exact = None;
}
b')' => {
let Some(entry) = compile_stack.pop() else {
return Err(RegexCompileError {
message: "unmatched \\)".to_string(),
});
};
if let Some(fixup) = fixup_alt_jump {
let target = bpos!() as i16 - fixup as i16 - 2;
store_number(&mut buf.buffer, fixup, target);
}
if let Some(group_num) = entry.assigned_group {
emit_op!(RegexOp::StopMemory);
emit!(group_num as u8);
}
begalt_offset = entry.begalt_offset;
fixup_alt_jump = entry.fixup_alt_jump;
laststart = Some(entry.group_bytecode_start);
laststart_is_group = true;
pending_exact = None;
}
b'|' => {
pending_exact = None;
emit_op!(RegexOp::Jump);
let jump_pos = bpos!();
emit!(0);
emit!(0);
if let Some(fixup) = fixup_alt_jump {
let target = bpos!() as i16 - fixup as i16 - 2;
store_number(&mut buf.buffer, fixup, target);
}
let alt_start = begalt_offset;
buf.buffer
.splice(alt_start..alt_start, [RegexOp::OnFailureJump as u8, 0, 0]);
let target = (bpos!() - alt_start - 3) as i16;
store_number(&mut buf.buffer, alt_start + 1, target);
fixup_alt_jump = Some(jump_pos + 3);
begalt_offset = bpos!();
laststart = None;
}
b'`' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::BegBuf);
}
b'\'' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::EndBuf);
}
b'=' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::AtDot);
}
b'b' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::WordBound);
}
b'B' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::NotWordBound);
}
b'<' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::WordBeg);
}
b'>' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::WordEnd);
}
b'_' => {
if p < plen {
let c3 = pattern_bytes[p];
p += 1;
match c3 {
b'<' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::SymBeg);
}
b'>' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::SymEnd);
}
_ => {
goto_normal_char(
b'_' as u32,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
}
}
}
}
b'w' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::SyntaxSpec);
emit!(SyntaxClass::Word as u8);
}
b'W' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::NotSyntaxSpec);
emit!(SyntaxClass::Word as u8);
}
b's' => {
if p >= plen {
return Err(RegexCompileError {
message: "\\s requires syntax class character".to_string(),
});
}
let sc = pattern_bytes[p] as char;
p += 1;
let Some(class) = SyntaxClass::from_char(sc) else {
return Err(RegexCompileError {
message: format!("invalid syntax class: \\s{sc}"),
});
};
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::SyntaxSpec);
emit!(class as u8);
}
b'S' => {
if p >= plen {
return Err(RegexCompileError {
message: "\\S requires syntax class character".to_string(),
});
}
let sc = pattern_bytes[p] as char;
p += 1;
let Some(class) = SyntaxClass::from_char(sc) else {
return Err(RegexCompileError {
message: format!("invalid syntax class: \\S{sc}"),
});
};
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::NotSyntaxSpec);
emit!(class as u8);
}
b'c' => {
if p >= plen {
return Err(RegexCompileError {
message: "\\c requires category character".to_string(),
});
}
let cat = pattern_bytes[p];
p += 1;
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::CategorySpec);
emit!(cat);
}
b'C' => {
if p >= plen {
return Err(RegexCompileError {
message: "\\C requires category character".to_string(),
});
}
let cat = pattern_bytes[p];
p += 1;
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::NotCategorySpec);
emit!(cat);
}
b'1'..=b'9' => {
let group = (c2 - b'0') as usize;
if group > buf.re_nsub {
return Err(RegexCompileError {
message: format!(
"invalid back reference \\{group}: only {} groups defined",
buf.re_nsub
),
});
}
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::Duplicate);
emit!(group as u8);
}
b'{' => {
let _interval_start = p;
let (min_count, max_count) = parse_interval(pattern_bytes, &mut p)?;
let lazy = p < plen && pattern_bytes[p] == b'?';
if lazy {
p += 1;
}
let Some(last) = laststart else {
return Err(RegexCompileError {
message: "\\{ without preceding expression".to_string(),
});
};
compile_interval(min_count, max_count, lazy, last, &mut buf)?;
laststart = None;
laststart_is_group = false;
pending_exact = None;
}
b't' => {
goto_normal_char(
b'\t' as u32,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
}
b'n' => {
goto_normal_char(
b'\n' as u32,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
}
b'r' => {
goto_normal_char(
b'\r' as u32,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
}
b'f' => {
goto_normal_char(
0x0c,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
}
b'a' => {
goto_normal_char(
0x07,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
}
b'e' => {
goto_normal_char(
0x1b,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
}
b'd' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::Charset);
emit!(32);
let bitmap_start = buf.buffer.len();
buf.buffer.extend_from_slice(&[0u8; 32]);
for ch in b'0'..=b'9' {
set_bitmap_bit(&mut buf.buffer, bitmap_start, ch, None);
}
}
b'D' => {
laststart = Some(bpos!());
pending_exact = None;
emit_op!(RegexOp::CharsetNot);
emit!(32);
let bitmap_start = buf.buffer.len();
buf.buffer.extend_from_slice(&[0u8; 32]);
for ch in b'0'..=b'9' {
set_bitmap_bit(&mut buf.buffer, bitmap_start, ch, None);
}
}
_ => {
if buf.multibyte && c2 >= 0x80 {
let char_start = p - 1;
let (code, len) = decode_pattern_char(pattern_bytes, char_start, true)
.unwrap_or((c2 as u32, 1));
p = char_start + len;
goto_normal_char(
code,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
} else {
goto_normal_char(
c2 as u32,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
}
}
}
}
_ => {
if buf.multibyte && c >= 0x80 {
let char_start = p - 1;
let (code, len) = decode_pattern_char(pattern_bytes, char_start, true)
.unwrap_or((c as u32, 1));
p = char_start + len;
goto_normal_char(
code,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
} else {
goto_normal_char(
c as u32,
&mut buf,
&mut pending_exact,
&mut laststart,
&mut laststart_is_group,
);
}
}
}
}
if !compile_stack.is_empty() {
return Err(RegexCompileError {
message: "unmatched \\(".to_string(),
});
}
if let Some(fixup) = fixup_alt_jump {
let target = bpos!() as i16 - fixup as i16 - 2;
store_number(&mut buf.buffer, fixup, target);
}
if !posix {
emit_op!(RegexOp::Succeed);
}
compile_fastmap(&mut buf);
Ok(buf)
}
fn goto_normal_char(
c: u32,
buf: &mut CompiledPattern,
pending_exact: &mut Option<usize>,
laststart: &mut Option<usize>,
laststart_is_group: &mut bool,
) {
let c = if let Some(table) = buf.translate.as_ref() {
if (c as usize) < table.len() {
table[c as usize] as u32
} else {
c
}
} else {
c
};
let mut encoded = [0u8; emacs_char::MAX_MULTIBYTE_LENGTH];
let encoded_len = if buf.multibyte {
emacs_char::char_string(c, &mut encoded)
} else {
encoded[0] = c as u8;
1
};
if let Some(exact_pos) = *pending_exact {
let count = buf.buffer[exact_pos] as usize;
if count + encoded_len <= 255 {
buf.buffer[exact_pos] += encoded_len as u8;
buf.buffer.extend_from_slice(&encoded[..encoded_len]);
*laststart_is_group = false;
return;
}
}
*laststart = Some(buf.buffer.len());
*laststart_is_group = false;
buf.buffer.push(RegexOp::Exactn as u8);
*pending_exact = Some(buf.buffer.len());
buf.buffer.push(encoded_len as u8);
buf.buffer.extend_from_slice(&encoded[..encoded_len]);
}
fn compile_repetition(
op: u8,
greedy: bool,
_posix: bool,
laststart: usize,
buf: &mut CompiledPattern,
) -> Result<(), RegexCompileError> {
let after_last = buf.buffer.len();
match op {
b'*' => {
if greedy {
buf.buffer.splice(
laststart..laststart,
[RegexOp::OnFailureJumpLoop as u8, 0, 0],
);
let expr_len = after_last - laststart;
buf.buffer.push(RegexOp::Jump as u8);
let jpos = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
let ofjl_offset = (expr_len + 3) as i16;
store_number(&mut buf.buffer, laststart + 1, ofjl_offset);
let jump_offset = laststart as i16 - (jpos as i16 + 2);
store_number(&mut buf.buffer, jpos, jump_offset);
} else {
let expr_bytes = buf.buffer[laststart..after_last].to_vec();
let body_may_be_empty = repeated_body_may_match_empty(&expr_bytes);
buf.buffer.truncate(laststart);
buf.buffer.push(RegexOp::Jump as u8);
let jump_pos = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
let expr_start = buf.buffer.len();
buf.buffer.extend_from_slice(&expr_bytes);
if body_may_be_empty {
buf.buffer.push(RegexOp::NoOp as u8);
}
let cond_pos = buf.buffer.len();
buf.buffer.push(if body_may_be_empty {
RegexOp::OnFailureJumpNastyloop as u8
} else {
RegexOp::OnFailureJump as u8
});
let cond_arg_pos = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
let jump_offset = cond_pos as i16 - (jump_pos as i16 + 2);
store_number(&mut buf.buffer, jump_pos, jump_offset);
let cond_offset = expr_start as i16 - (cond_arg_pos as i16 + 2);
store_number(&mut buf.buffer, cond_arg_pos, cond_offset);
}
}
b'+' => {
if greedy {
buf.buffer.push(RegexOp::OnFailureJumpLoop as u8);
let ofjl_pos = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
buf.buffer.push(RegexOp::Jump as u8);
let jpos = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
store_number(&mut buf.buffer, ofjl_pos, (jpos + 2 - ofjl_pos - 2) as i16);
let jump_offset = laststart as i16 - (jpos as i16 + 2);
store_number(&mut buf.buffer, jpos, jump_offset);
} else {
let expr_bytes = buf.buffer[laststart..after_last].to_vec();
let body_may_be_empty = repeated_body_may_match_empty(&expr_bytes);
buf.buffer.truncate(laststart);
let expr_start = buf.buffer.len();
buf.buffer.extend_from_slice(&expr_bytes);
if body_may_be_empty {
buf.buffer.push(RegexOp::NoOp as u8);
}
buf.buffer.push(if body_may_be_empty {
RegexOp::OnFailureJumpNastyloop as u8
} else {
RegexOp::OnFailureJump as u8
});
let cond_arg_pos = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
let cond_offset = expr_start as i16 - (cond_arg_pos as i16 + 2);
store_number(&mut buf.buffer, cond_arg_pos, cond_offset);
}
}
b'?' => {
if greedy {
buf.buffer
.splice(laststart..laststart, [RegexOp::OnFailureJump as u8, 0, 0]);
let expr_len = after_last - laststart;
store_number(&mut buf.buffer, laststart + 1, expr_len as i16);
} else {
buf.buffer.splice(
laststart..laststart,
[RegexOp::OnFailureKeepStringJump as u8, 0, 0],
);
let expr_len = after_last - laststart;
store_number(&mut buf.buffer, laststart + 1, expr_len as i16);
}
}
_ => unreachable!(),
}
Ok(())
}
fn repeated_body_may_match_empty(body: &[u8]) -> bool {
let Some(&op) = body.first() else {
return true;
};
!matches!(
RegexOp::from_byte(op),
Some(
RegexOp::Exactn
| RegexOp::AnyChar
| RegexOp::Charset
| RegexOp::CharsetNot
| RegexOp::SyntaxSpec
| RegexOp::NotSyntaxSpec
| RegexOp::CategorySpec
| RegexOp::NotCategorySpec
)
)
}
fn decode_pattern_char(bytes: &[u8], pos: usize, multibyte: bool) -> Option<(u32, usize)> {
if pos >= bytes.len() {
return None;
}
if multibyte {
Some(emacs_char::string_char(&bytes[pos..]))
} else {
Some((bytes[pos] as u32, 1))
}
}
fn emacs_char_to_rust_char(code: u32) -> char {
if emacs_char::char_byte8_p(code) {
char::from(emacs_char::char_to_byte8(code))
} else {
char::from_u32(code).unwrap_or(char::REPLACEMENT_CHARACTER)
}
}
fn compile_charset(
pattern: &[u8],
p: &mut usize,
buf: &mut CompiledPattern,
case_fold: bool,
pattern_multibyte: bool,
) -> Result<(), RegexCompileError> {
let plen = pattern.len();
if *p >= plen {
return Err(RegexCompileError {
message: "Unmatched [ or [^".to_string(),
});
}
let negate = *p < plen && pattern[*p] == b'^';
if negate {
*p += 1;
if *p >= plen {
return Err(RegexCompileError {
message: "Unmatched [ or [^".to_string(),
});
}
}
let op = if negate {
RegexOp::CharsetNot
} else {
RegexOp::Charset
};
let charset_opcode_pos = buf.buffer.len();
buf.buffer.push(op as u8);
let _bitmap_len_pos = buf.buffer.len();
buf.buffer.push(32);
let bitmap_start = buf.buffer.len();
buf.buffer.extend_from_slice(&[0u8; 32]);
let mut mb_ranges: Vec<(char, char)> = Vec::new();
let mut class_bits: u8 = 0;
let mut first = true;
let mut last_char: Option<char> = None; let mut closed = false;
while *p < plen {
let b = pattern[*p];
let (c, clen) =
decode_pattern_char(pattern, *p, pattern_multibyte).unwrap_or((b as u32, 1));
*p += clen;
if b == b']' && !first {
closed = true;
break;
}
first = false;
if b == b'-' && *p < plen && pattern[*p] != b']' {
if let Some(range_start) = last_char {
let (range_end, rlen) = decode_pattern_char(pattern, *p, pattern_multibyte)
.unwrap_or((pattern[*p] as u32, 1));
let range_end = emacs_char_to_rust_char(range_end);
*p += rlen;
let translate = buf.translate.as_deref();
if range_start.is_ascii() && range_end.is_ascii() {
for ch in (range_start as u8)..=(range_end as u8) {
set_bitmap_bit(&mut buf.buffer, bitmap_start, ch, translate);
}
} else {
let start_u32 = range_start as u32;
let end_u32 = range_end as u32;
if start_u32 <= 127 {
let ascii_end = end_u32.min(127) as u8;
for ch in (start_u32 as u8)..=ascii_end {
set_bitmap_bit(&mut buf.buffer, bitmap_start, ch, translate);
}
}
let mb_start = if start_u32 >= 128 {
range_start
} else {
'\u{80}'
};
if end_u32 >= 128 {
add_multibyte_range(&mut mb_ranges, mb_start, range_end, case_fold);
}
}
last_char = None; continue;
}
set_bitmap_bit(
&mut buf.buffer,
bitmap_start,
b'-',
buf.translate.as_deref(),
);
last_char = Some('-');
continue;
}
if b == b'\\' {
set_bitmap_bit(
&mut buf.buffer,
bitmap_start,
b'\\',
buf.translate.as_deref(),
);
last_char = Some('\\');
continue;
}
if b == b'[' && *p < plen && pattern[*p] == b':' {
*p += 1; let class_start = *p;
while *p < plen && pattern[*p] != b':' {
*p += 1;
}
if *p < plen {
let class_name = std::str::from_utf8(&pattern[class_start..*p]).unwrap_or("");
*p += 1; if *p < plen && pattern[*p] == b']' {
*p += 1; }
apply_posix_class(
class_name,
&mut buf.buffer,
bitmap_start,
&mut mb_ranges,
&mut class_bits,
buf.translate.as_deref(),
)?;
last_char = None;
continue;
}
}
let c = emacs_char_to_rust_char(c);
if c.is_ascii() {
set_bitmap_bit(
&mut buf.buffer,
bitmap_start,
c as u8,
buf.translate.as_deref(),
);
} else {
add_multibyte_range(&mut mb_ranges, c, c, case_fold);
}
last_char = Some(c);
}
if !closed {
return Err(RegexCompileError {
message: "Unmatched [ or [^".to_string(),
});
}
if !mb_ranges.is_empty() {
buf.multibyte_charsets.insert(charset_opcode_pos, mb_ranges);
}
if class_bits != 0 {
buf.charset_class_bits
.insert(charset_opcode_pos, class_bits);
}
Ok(())
}
fn add_multibyte_range(ranges: &mut Vec<(char, char)>, start: char, end: char, case_fold: bool) {
ranges.push((start, end));
if case_fold {
if start == end {
for variant in start.to_lowercase() {
if variant != start {
ranges.push((variant, variant));
}
}
for variant in start.to_uppercase() {
if variant != start {
ranges.push((variant, variant));
}
}
}
}
}
fn set_bitmap_bit(buffer: &mut Vec<u8>, bitmap_start: usize, c: u8, translate: Option<&[char]>) {
let target = match translate {
Some(table) => table
.get(c as usize)
.map(|ch| *ch as u32 as u8)
.unwrap_or(c),
None => c,
};
let byte_idx = bitmap_start + (target as usize / 8);
let bit_idx = target as usize % 8;
if byte_idx < buffer.len() {
buffer[byte_idx] |= 1 << bit_idx;
}
}
fn apply_posix_class(
name: &str,
buffer: &mut Vec<u8>,
bitmap_start: usize,
mb_ranges: &mut Vec<(char, char)>,
class_bits: &mut u8,
translate: Option<&[char]>,
) -> Result<(), RegexCompileError> {
match name {
"word" => *class_bits |= CHARSET_CLASS_BIT_WORD,
"space" => *class_bits |= CHARSET_CLASS_BIT_SPACE,
_ => {}
}
let ascii_bytes: Vec<u8> = match name {
"alpha" => (b'A'..=b'Z').chain(b'a'..=b'z').collect(),
"digit" => (b'0'..=b'9').collect(),
"alnum" => (b'A'..=b'Z')
.chain(b'a'..=b'z')
.chain(b'0'..=b'9')
.collect(),
"space" => vec![b' ', b'\t', b'\n', b'\r', 0x0C],
"blank" => vec![b' ', b'\t'],
"upper" => (b'A'..=b'Z').collect(),
"lower" => (b'a'..=b'z').collect(),
"punct" => b"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".to_vec(),
"print" => (0x20u8..=0x7E).chain(0xA0u8..=0xFF).collect(),
"graph" => (0x21u8..=0x7E).chain(0xA1u8..=0xFF).collect(),
"cntrl" => (0x00u8..=0x1F).chain(std::iter::once(0x7F)).collect(),
"xdigit" => (b'0'..=b'9')
.chain(b'A'..=b'F')
.chain(b'a'..=b'f')
.collect(),
"ascii" => (0x00u8..=0x7F).collect(),
"word" => (b'A'..=b'Z')
.chain(b'a'..=b'z')
.chain(b'0'..=b'9')
.collect(),
"nonascii" => (0x80u8..=0xFF).collect(),
"unibyte" => (0x00u8..=0xFF).collect(),
"multibyte" => Vec::new(),
_ => {
return Err(RegexCompileError {
message: format!("Invalid character class name: {}", name),
});
}
};
for c in ascii_bytes {
set_bitmap_bit(buffer, bitmap_start, c, None);
if translate.is_some() {
set_bitmap_bit(buffer, bitmap_start, c, translate);
}
}
match name {
"nonascii" | "multibyte" => {
mb_ranges.push(('\u{80}', '\u{10FFFF}'));
}
_ => {}
}
Ok(())
}
fn parse_interval(
pattern: &[u8],
p: &mut usize,
) -> Result<(usize, Option<usize>), RegexCompileError> {
let plen = pattern.len();
let mut min = 0usize;
while *p < plen && pattern[*p].is_ascii_digit() {
min = min * 10 + (pattern[*p] - b'0') as usize;
*p += 1;
}
let max = if *p < plen && pattern[*p] == b',' {
*p += 1; if *p < plen && pattern[*p] == b'\\' && *p + 1 < plen && pattern[*p + 1] == b'}' {
None
} else {
let mut m = 0usize;
while *p < plen && pattern[*p].is_ascii_digit() {
m = m * 10 + (pattern[*p] - b'0') as usize;
*p += 1;
}
Some(m)
}
} else {
Some(min) };
if *p + 1 < plen && pattern[*p] == b'\\' && pattern[*p + 1] == b'}' {
*p += 2;
} else {
return Err(RegexCompileError {
message: "unterminated \\{".to_string(),
});
}
Ok((min, max))
}
fn compile_interval(
min: usize,
max: Option<usize>,
lazy: bool,
laststart: usize,
buf: &mut CompiledPattern,
) -> Result<(), RegexCompileError> {
let expr_bytes: Vec<u8> = buf.buffer[laststart..].to_vec();
buf.buffer.truncate(laststart);
for _ in 0..min {
buf.buffer.extend_from_slice(&expr_bytes);
}
match max {
Some(max_val) if max_val > min => {
if lazy {
for _ in 0..(max_val - min) {
buf.buffer.push(RegexOp::OnFailureKeepStringJump as u8);
let jpos = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
buf.buffer.extend_from_slice(&expr_bytes);
let target = (buf.buffer.len() - jpos - 2) as i16;
store_number(&mut buf.buffer, jpos, target);
}
} else {
for _ in 0..(max_val - min) {
buf.buffer.push(RegexOp::OnFailureJump as u8);
let jpos = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
buf.buffer.extend_from_slice(&expr_bytes);
let target = (buf.buffer.len() - jpos - 2) as i16;
store_number(&mut buf.buffer, jpos, target);
}
}
}
None => {
if lazy {
let loop_start = buf.buffer.len();
buf.buffer.push(RegexOp::OnFailureKeepStringJump as u8);
let jpos = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
buf.buffer.extend_from_slice(&expr_bytes);
buf.buffer.push(RegexOp::OnFailureJump as u8);
let jpos2 = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
let skip_target = (jpos2 + 2 - jpos - 2) as i16;
store_number(&mut buf.buffer, jpos, skip_target);
let ofj_target = loop_start as i16 - (jpos2 as i16 + 2);
store_number(&mut buf.buffer, jpos2, ofj_target);
} else {
let loop_start = buf.buffer.len();
buf.buffer.push(RegexOp::OnFailureJumpLoop as u8);
let jpos = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
buf.buffer.extend_from_slice(&expr_bytes);
buf.buffer.push(RegexOp::Jump as u8);
let jpos2 = buf.buffer.len();
buf.buffer.push(0);
buf.buffer.push(0);
let fail_target = (jpos2 + 2 - jpos - 2) as i16;
store_number(&mut buf.buffer, jpos, fail_target);
let jump_target = loop_start as i16 - (jpos2 as i16 + 2);
store_number(&mut buf.buffer, jpos2, jump_target);
}
}
_ => {} }
Ok(())
}
pub(crate) trait SyntaxLookup {
fn char_syntax(&self, c: char) -> SyntaxClass;
fn char_has_category(&self, c: char, cat: u8) -> bool;
}
pub(crate) struct DefaultSyntaxLookup;
pub(crate) struct BufferSyntaxLookup {
pub syntax_table: crate::emacs_core::syntax::SyntaxTable,
}
impl SyntaxLookup for DefaultSyntaxLookup {
fn char_syntax(&self, c: char) -> SyntaxClass {
if c.is_alphanumeric() || c == '_' {
SyntaxClass::Word
} else if c.is_whitespace() {
SyntaxClass::Whitespace
} else if c.is_ascii_punctuation() {
SyntaxClass::Punctuation
} else if matches!(c, '(' | '[' | '{') {
SyntaxClass::Open
} else if matches!(c, ')' | ']' | '}') {
SyntaxClass::Close
} else if c == '"' || c == '\'' {
SyntaxClass::StringDelim
} else {
SyntaxClass::Symbol
}
}
fn char_has_category(&self, c: char, cat: u8) -> bool {
default_char_has_category(c, cat)
}
}
impl SyntaxLookup for BufferSyntaxLookup {
fn char_syntax(&self, c: char) -> SyntaxClass {
self.syntax_table.char_syntax(c)
}
fn char_has_category(&self, c: char, cat: u8) -> bool {
default_char_has_category(c, cat)
}
}
fn default_char_has_category(c: char, cat: u8) -> bool {
let cp = c as u32;
match cat {
b'|' => !c.is_ascii(),
b'a' => c.is_ascii(),
b'A' => matches!(cp, 0xFF10..=0xFF19 | 0xFF21..=0xFF3A | 0xFF41..=0xFF5A),
b'l' | b'r' => {
c.is_ascii_alphabetic()
|| matches!(cp, 0x00C0..=0x00FF | 0x0100..=0x024F | 0x1E00..=0x1EFF)
}
b'g' => matches!(cp, 0x0370..=0x03FF | 0x1F00..=0x1FFF),
b'G' => matches!(cp, 0x0370..=0x03FF | 0x1F00..=0x1FFF),
b'y' | b'Y' => matches!(cp, 0x0400..=0x04FF | 0x0500..=0x052F),
b'b' => matches!(cp, 0x0600..=0x06FF | 0x0750..=0x077F | 0xFB50..=0xFDFF),
b'w' => matches!(cp, 0x0590..=0x05FF | 0xFB1D..=0xFB4F),
b't' => matches!(cp, 0x0E00..=0x0E7F),
b'o' => matches!(cp, 0x0E80..=0x0EFF),
b'q' => matches!(cp, 0x0F00..=0x0FFF),
b'i' => matches!(cp, 0x0900..=0x097F),
b'I' => matches!(cp, 0x0900..=0x0DFF),
b'e' => matches!(cp, 0x1200..=0x137F),
b'v' => matches!(cp, 0x1E00..=0x1EFF),
b'h' | b'N' => {
matches!(cp, 0x1100..=0x11FF | 0xAC00..=0xD7A3 | 0xA960..=0xA97F | 0xD7B0..=0xD7FF)
}
b'c' | b'C' => matches!(
cp,
0x3400..=0x4DBF
| 0x4E00..=0x9FFF
| 0xF900..=0xFAFF
| 0x20000..=0x2FFFF
| 0x30000..=0x323AF
),
b'H' => matches!(cp, 0x3040..=0x309F | 0x1B000..=0x1B16F),
b'K' => matches!(
cp,
0x3099..=0x309C | 0x30A0..=0x30FF | 0x31F0..=0x31FF | 0x1AFF0..=0x1B16F
),
b'k' => matches!(cp, 0x30A0..=0x30FF | 0x31F0..=0x31FF | 0xFF66..=0xFF9F),
b'j' => matches!(
cp,
0x3000..=0x303F
| 0x3040..=0x309F
| 0x30A0..=0x30FF
| 0xFF00..=0xFFEF
),
b'.' => match c.is_ascii() {
true => c.is_ascii_graphic() || c == ' ',
false => {
!matches!(cp, 0x0300..=0x036F | 0x1AB0..=0x1AFF | 0x1DC0..=0x1DFF | 0x20D0..=0x20FF | 0xFE20..=0xFE2F)
}
},
b'^' => {
matches!(cp, 0x0300..=0x036F | 0x1AB0..=0x1AFF | 0x1DC0..=0x1DFF | 0x20D0..=0x20FF | 0xFE20..=0xFE2F)
}
b'R' => matches!(cp, 0x0590..=0x05FF | 0x0600..=0x06FF | 0xFB1D..=0xFDFF | 0xFE70..=0xFEFF),
b'L' => {
!matches!(cp, 0x0590..=0x05FF | 0x0600..=0x06FF | 0xFB1D..=0xFDFF | 0xFE70..=0xFEFF)
}
b'6' => c.is_numeric(),
_ => false,
}
}
pub(crate) fn re_match(
pattern: &CompiledPattern,
text: &[u8],
pos: usize,
stop: usize,
syntax: &dyn SyntaxLookup,
point: usize,
) -> Option<(usize, MatchRegisters)> {
let bytecode = &pattern.buffer;
let num_regs = pattern.re_nsub + 1;
let mut fail_stack: Vec<FailurePoint> = Vec::new();
let mut counters: HashMap<usize, i16> = HashMap::new();
let has_subexpressions = pattern.re_nsub > 0;
let mut regstart: RegisterScratch = if has_subexpressions {
register_scratch(num_regs)
} else {
RegisterScratch::new()
};
let mut regend: RegisterScratch = if has_subexpressions {
register_scratch(num_regs)
} else {
RegisterScratch::new()
};
let posix_longest = pattern.posix;
let mut best_regs_set = false;
let mut best_match_end: usize = pos;
let mut best_regstart: RegisterScratch = if has_subexpressions {
register_scratch(num_regs)
} else {
RegisterScratch::new()
};
let mut best_regend: RegisterScratch = if has_subexpressions {
register_scratch(num_regs)
} else {
RegisterScratch::new()
};
let mut pc = 0usize; let mut d = pos;
let translate = &pattern.translate;
let pattern_multibyte = pattern.multibyte;
let target_multibyte = pattern.target_multibyte;
let tr = |c: u32| -> u32 {
if let Some(table) = translate {
if c < 256 { table[c as usize] as u32 } else { c }
} else {
c
}
};
let text_byte = |pos: usize| -> Option<u8> {
if pos < text.len() {
Some(text[pos])
} else {
None
}
};
let unibyte_to_emacs_char = |byte: u8| -> u32 {
if byte < 0x80 {
byte as u32
} else {
emacs_char::byte8_to_char(byte)
}
};
let syntax_char = |code: u32| -> char {
if emacs_char::char_byte8_p(code) {
char::from(emacs_char::char_to_byte8(code))
} else {
char::from_u32(code).unwrap_or(char::REPLACEMENT_CHARACTER)
}
};
let emacs_char_to_unibyte = |code: u32| -> Option<u8> {
if code < 0x80 || emacs_char::char_byte8_p(code) {
Some(emacs_char::char_to_byte8(code))
} else {
None
}
};
let text_char = |pos: usize| -> Option<(u32, usize)> {
if pos >= text.len() {
return None;
}
if target_multibyte {
Some(emacs_char::string_char(&text[pos..]))
} else {
Some((unibyte_to_emacs_char(text[pos]), 1))
}
};
let prev_char_start = |pos: usize| -> Option<usize> {
if pos == 0 {
return None;
}
if !target_multibyte {
return Some(pos - 1);
}
let mut p = pos - 1;
while p > 0 && (text[p] & 0xC0) == 0x80 {
p -= 1;
}
Some(p)
};
let at_word_boundary = |pos: usize| -> bool {
let prev_word = if let Some(prev) = prev_char_start(pos) {
text_char(prev)
.map(|(c, _)| syntax.char_syntax(syntax_char(c)) == SyntaxClass::Word)
.unwrap_or(false)
} else {
false
};
let curr_word = text_char(pos)
.map(|(c, _)| syntax.char_syntax(syntax_char(c)) == SyntaxClass::Word)
.unwrap_or(false);
prev_word != curr_word
};
let at_symbol_boundary = |pos: usize| -> bool {
let is_symbol_char = |c: u32| {
let syn = syntax.char_syntax(syntax_char(c));
syn == SyntaxClass::Word || syn == SyntaxClass::Symbol
};
let prev_sym = if let Some(prev) = prev_char_start(pos) {
text_char(prev)
.map(|(c, _)| is_symbol_char(c))
.unwrap_or(false)
} else {
false
};
let curr_sym = text_char(pos)
.map(|(c, _)| is_symbol_char(c))
.unwrap_or(false);
prev_sym != curr_sym
};
let mut total_failure = false;
macro_rules! try_fail {
($label:lifetime) => {
if goto_fail(
&mut pc,
&mut d,
&mut fail_stack,
&mut regstart,
&mut regend,
&mut counters,
)
.is_none()
{
total_failure = true;
break $label;
}
};
}
'main_loop: loop {
if pc >= bytecode.len() {
if posix_longest && d < stop {
let better_than_best = !best_regs_set || d > best_match_end;
if !fail_stack.is_empty() {
if better_than_best {
best_regs_set = true;
best_match_end = d;
best_regstart.clone_from_slice(®start);
best_regend.clone_from_slice(®end);
}
try_fail!('main_loop);
continue 'main_loop;
} else if best_regs_set && !better_than_best {
d = best_match_end;
for i in 1..num_regs {
regstart[i] = best_regstart[i];
regend[i] = best_regend[i];
}
}
}
break 'main_loop;
}
let op_byte = bytecode[pc];
let Some(op) = RegexOp::from_byte(op_byte) else {
return None;
};
pc += 1;
match op {
RegexOp::NoOp => {
}
RegexOp::Succeed => {
break 'main_loop;
}
RegexOp::Exactn => {
let count = bytecode[pc] as usize;
pc += 1;
let mut matched = true;
let literal_start = pc;
let literal_end = literal_start + count;
let mut pat_pos = literal_start;
while pat_pos < literal_end {
if d >= stop {
matched = false;
break;
}
let Some((buf_ch, buf_len)) = text_char(d) else {
matched = false;
break;
};
if target_multibyte {
let (pat_ch, pat_len) = if pattern_multibyte {
emacs_char::string_char(&bytecode[pat_pos..literal_end])
} else {
(unibyte_to_emacs_char(bytecode[pat_pos]), 1)
};
if tr(buf_ch) != pat_ch {
matched = false;
break;
}
pat_pos += pat_len;
d += buf_len;
} else {
let pat_byte = if pattern_multibyte {
let (pat_ch, pat_len) =
emacs_char::string_char(&bytecode[pat_pos..literal_end]);
let Some(byte) = emacs_char_to_unibyte(pat_ch) else {
matched = false;
break;
};
pat_pos += pat_len;
byte
} else {
let byte = bytecode[pat_pos];
pat_pos += 1;
byte
};
let buf_byte = text[d];
let mut translated = unibyte_to_emacs_char(buf_byte);
if !emacs_char::char_byte8_p(translated) {
translated = tr(translated);
if let Some(byte) = emacs_char_to_unibyte(translated) {
translated = byte as u32;
} else {
translated = buf_byte as u32;
}
} else {
translated = buf_byte as u32;
}
if translated as u8 != pat_byte {
matched = false;
break;
}
d += 1;
}
}
pc = literal_end;
if !matched {
try_fail!('main_loop);
}
}
RegexOp::AnyChar => {
if d >= stop {
try_fail!('main_loop);
continue;
}
let Some((buf_ch, buf_len)) = text_char(d) else {
try_fail!('main_loop);
continue;
};
if tr(buf_ch) == '\n' as u32 {
try_fail!('main_loop);
continue;
}
d += buf_len;
}
RegexOp::Charset | RegexOp::CharsetNot => {
let negate = op == RegexOp::CharsetNot;
let charset_op_pos = pc - 1; let bitmap_len = bytecode[pc] as usize & 0x7F;
pc += 1;
if d >= stop {
pc += bitmap_len;
try_fail!('main_loop);
continue;
}
let Some((orig_ch, ch_len)) = text_char(d) else {
pc += bitmap_len;
try_fail!('main_loop);
continue;
};
let mut ch = orig_ch;
let mut unibyte_char = false;
if target_multibyte {
ch = tr(ch);
if let Some(byte) = emacs_char_to_unibyte(ch) {
unibyte_char = true;
ch = byte as u32;
}
} else {
let mut converted = unibyte_to_emacs_char(text[d]);
if !emacs_char::char_byte8_p(converted) {
converted = tr(converted);
if let Some(byte) = emacs_char_to_unibyte(converted) {
unibyte_char = true;
ch = byte as u32;
}
} else {
unibyte_char = true;
ch = text[d] as u32;
}
}
let in_set = if unibyte_char {
let c = ch as usize;
if (c / 8) < bitmap_len {
let byte = bytecode[pc + c / 8];
(byte >> (c % 8)) & 1 != 0
} else {
false
}
} else if let Some(ranges) = pattern.multibyte_charsets.get(&charset_op_pos) {
let ch = syntax_char(ch);
ranges.iter().any(|&(lo, hi)| ch >= lo && ch <= hi)
} else {
false
};
let in_set = in_set
|| pattern
.charset_class_bits
.get(&charset_op_pos)
.copied()
.map(|bits| {
(bits & CHARSET_CLASS_BIT_WORD != 0
&& syntax.char_syntax(syntax_char(orig_ch)) == SyntaxClass::Word)
|| (bits & CHARSET_CLASS_BIT_SPACE != 0
&& syntax.char_syntax(syntax_char(orig_ch))
== SyntaxClass::Whitespace)
})
.unwrap_or(false);
let matched = if negate { !in_set } else { in_set };
pc += bitmap_len;
if !matched {
try_fail!('main_loop);
continue;
}
d += ch_len;
}
RegexOp::StartMemory => {
let group = bytecode[pc] as usize;
pc += 1;
if group < num_regs
&& let Some(start) = regstart.get_mut(group)
{
*start = Some(d);
}
}
RegexOp::StopMemory => {
let group = bytecode[pc] as usize;
pc += 1;
if group < num_regs
&& let Some(end) = regend.get_mut(group)
{
*end = Some(d);
}
}
RegexOp::Duplicate => {
let group = bytecode[pc] as usize;
pc += 1;
let Some(start) = regstart.get(group).copied().flatten() else {
try_fail!('main_loop);
continue;
};
let Some(end) = regend.get(group).copied().flatten() else {
try_fail!('main_loop);
continue;
};
let ref_len = end - start;
if d + ref_len > stop {
try_fail!('main_loop);
continue;
}
let mut matched = true;
for i in 0..ref_len {
if tr(text[d + i].into()) != tr(text[start + i].into()) {
matched = false;
break;
}
}
if !matched {
try_fail!('main_loop);
continue;
}
d += ref_len;
}
RegexOp::BegLine => {
if d == 0 || (d > 0 && text[d - 1] == b'\n') {
} else {
try_fail!('main_loop);
}
}
RegexOp::EndLine => {
if d >= stop || text[d] == b'\n' {
} else {
try_fail!('main_loop);
}
}
RegexOp::BegBuf => {
if d != 0 {
try_fail!('main_loop);
}
}
RegexOp::EndBuf => {
if d != stop {
try_fail!('main_loop);
}
}
RegexOp::AtDot => {
if d != point {
try_fail!('main_loop);
}
}
RegexOp::WordBound => {
if !at_word_boundary(d) {
try_fail!('main_loop);
}
}
RegexOp::NotWordBound => {
if at_word_boundary(d) {
try_fail!('main_loop);
}
}
RegexOp::WordBeg => {
let prev_word = prev_char_start(d)
.and_then(|p| text_char(p))
.map(|(c, _)| syntax.char_syntax(syntax_char(c)) == SyntaxClass::Word)
.unwrap_or(false);
let curr_word = text_char(d)
.map(|(c, _)| syntax.char_syntax(syntax_char(c)) == SyntaxClass::Word)
.unwrap_or(false);
if prev_word || !curr_word {
try_fail!('main_loop);
}
}
RegexOp::WordEnd => {
let prev_word = prev_char_start(d)
.and_then(|p| text_char(p))
.map(|(c, _)| syntax.char_syntax(syntax_char(c)) == SyntaxClass::Word)
.unwrap_or(false);
let curr_word = text_char(d)
.map(|(c, _)| syntax.char_syntax(syntax_char(c)) == SyntaxClass::Word)
.unwrap_or(false);
if !prev_word || curr_word {
try_fail!('main_loop);
}
}
RegexOp::SymBeg => {
let is_sym = |c: u32| {
let s = syntax.char_syntax(syntax_char(c));
s == SyntaxClass::Word || s == SyntaxClass::Symbol
};
let prev_sym = prev_char_start(d)
.and_then(|p| text_char(p))
.map(|(c, _)| is_sym(c))
.unwrap_or(false);
let curr_sym = text_char(d).map(|(c, _)| is_sym(c)).unwrap_or(false);
if prev_sym || !curr_sym {
try_fail!('main_loop);
}
}
RegexOp::SymEnd => {
let is_sym = |c: u32| {
let s = syntax.char_syntax(syntax_char(c));
s == SyntaxClass::Word || s == SyntaxClass::Symbol
};
let prev_sym = prev_char_start(d)
.and_then(|p| text_char(p))
.map(|(c, _)| is_sym(c))
.unwrap_or(false);
let curr_sym = text_char(d).map(|(c, _)| is_sym(c)).unwrap_or(false);
if !prev_sym || curr_sym {
try_fail!('main_loop);
}
}
RegexOp::SyntaxSpec => {
let class_byte = bytecode[pc];
pc += 1;
if d >= stop {
try_fail!('main_loop);
continue;
}
let Some((c, len)) = text_char(d) else {
try_fail!('main_loop);
continue;
};
if syntax.char_syntax(syntax_char(c)) as u8 != class_byte {
try_fail!('main_loop);
continue;
}
d += len;
}
RegexOp::NotSyntaxSpec => {
let class_byte = bytecode[pc];
pc += 1;
if d >= stop {
try_fail!('main_loop);
continue;
}
let Some((c, len)) = text_char(d) else {
try_fail!('main_loop);
continue;
};
if syntax.char_syntax(syntax_char(c)) as u8 == class_byte {
try_fail!('main_loop);
continue;
}
d += len;
}
RegexOp::CategorySpec => {
let cat = bytecode[pc];
pc += 1;
if d >= stop {
try_fail!('main_loop);
continue;
}
let Some((c, len)) = text_char(d) else {
try_fail!('main_loop);
continue;
};
if !syntax.char_has_category(syntax_char(c), cat) {
try_fail!('main_loop);
continue;
}
d += len;
}
RegexOp::NotCategorySpec => {
let cat = bytecode[pc];
pc += 1;
if d >= stop {
try_fail!('main_loop);
continue;
}
let Some((c, len)) = text_char(d) else {
try_fail!('main_loop);
continue;
};
if syntax.char_has_category(syntax_char(c), cat) {
try_fail!('main_loop);
continue;
}
d += len;
}
RegexOp::Jump => {
if crate::emacs_core::eval::tls_quit_pending() {
return None;
}
let offset = extract_number(bytecode, pc);
pc = ((pc as i64) + 2 + (offset as i64)) as usize;
}
RegexOp::OnFailureJump => {
let offset = extract_number(bytecode, pc);
pc += 2;
let fail_pc = ((pc as i64) + (offset as i64)) as usize;
fail_stack.push(FailurePoint {
pattern_pos: fail_pc,
string_pos: Some(d),
saved_registers: save_registers(®start, ®end, num_regs),
saved_counters: counters.clone(),
});
}
RegexOp::OnFailureKeepStringJump => {
let offset = extract_number(bytecode, pc);
pc += 2;
let fail_pc = ((pc as i64) + (offset as i64)) as usize;
fail_stack.push(FailurePoint {
pattern_pos: fail_pc,
string_pos: None, saved_registers: save_registers(®start, ®end, num_regs),
saved_counters: counters.clone(),
});
}
RegexOp::OnFailureJumpLoop => {
let offset = extract_number(bytecode, pc);
pc += 2;
let fail_pc = ((pc as i64) + (offset as i64)) as usize;
let already_at_same_pos = fail_stack
.last()
.is_some_and(|fp| fp.string_pos == Some(d) && fp.pattern_pos == fail_pc);
if already_at_same_pos {
pc = fail_pc;
} else {
fail_stack.push(FailurePoint {
pattern_pos: fail_pc,
string_pos: Some(d),
saved_registers: save_registers(®start, ®end, num_regs),
saved_counters: counters.clone(),
});
}
}
RegexOp::OnFailureJumpNastyloop => {
let offset = extract_number(bytecode, pc);
pc += 2;
let fail_pc = ((pc as i64) + (offset as i64)) as usize;
fail_stack.push(FailurePoint {
pattern_pos: fail_pc,
string_pos: Some(d),
saved_registers: save_registers(®start, ®end, num_regs),
saved_counters: counters.clone(),
});
}
RegexOp::OnFailureJumpSmart => {
let offset = extract_number(bytecode, pc);
pc += 2;
let fail_pc = ((pc as i64) + (offset as i64)) as usize;
fail_stack.push(FailurePoint {
pattern_pos: fail_pc,
string_pos: Some(d),
saved_registers: save_registers(®start, ®end, num_regs),
saved_counters: counters.clone(),
});
}
RegexOp::SucceedN => {
let counter_pos = pc + 2; let count = get_counter(&counters, bytecode, counter_pos);
if count != 0 {
set_counter(&mut counters, counter_pos, count - 1);
pc += 4;
} else {
let offset = extract_number(bytecode, pc);
pc += 2; let fail_pc = ((pc as i64) + (offset as i64)) as usize;
pc += 2; let already_at_same_pos = fail_stack
.last()
.is_some_and(|fp| fp.string_pos == Some(d) && fp.pattern_pos == fail_pc);
if already_at_same_pos {
pc = fail_pc;
} else {
fail_stack.push(FailurePoint {
pattern_pos: fail_pc,
string_pos: Some(d),
saved_registers: save_registers(®start, ®end, num_regs),
saved_counters: counters.clone(),
});
}
}
}
RegexOp::JumpN => {
let counter_pos = pc + 2;
let count = get_counter(&counters, bytecode, counter_pos);
if count != 0 {
set_counter(&mut counters, counter_pos, count - 1);
let offset = extract_number(bytecode, pc);
pc = ((pc as i64) + 2 + (offset as i64)) as usize;
} else {
pc += 4; }
}
RegexOp::SetNumberAt => {
let rel_offset = extract_number(bytecode, pc);
pc += 2; let value = extract_number(bytecode, pc);
pc += 2; let target_pos = ((pc as i64) - 2 + (rel_offset as i64)) as usize;
set_counter(&mut counters, target_pos, value);
}
}
}
if total_failure {
if best_regs_set {
d = best_match_end;
for i in 1..num_regs {
regstart[i] = best_regstart[i];
regend[i] = best_regend[i];
}
} else {
return None;
}
}
let mut regs = MatchRegisters::new(num_regs);
regs.start[0] = pos as i64;
regs.end[0] = d as i64;
for i in 1..num_regs {
regs.start[i] = regstart
.get(i)
.copied()
.flatten()
.map(|v| v as i64)
.unwrap_or(-1);
regs.end[i] = regend
.get(i)
.copied()
.flatten()
.map(|v| v as i64)
.unwrap_or(-1);
}
Some((d, regs))
}
fn save_registers(
regstart: &[Option<usize>],
regend: &[Option<usize>],
num_regs: usize,
) -> SavedRegisters {
let mut saved = SavedRegisters::new();
for i in 1..num_regs.min(regstart.len()).min(regend.len()) {
saved.push((
i,
regstart[i].map(|v| v as i64).unwrap_or(-1),
regend[i].map(|v| v as i64).unwrap_or(-1),
));
}
saved
}
fn restore_registers(
fp: &FailurePoint,
regstart: &mut [Option<usize>],
regend: &mut [Option<usize>],
) {
for &(idx, start, end) in &fp.saved_registers {
if idx < regstart.len() {
regstart[idx] = if start >= 0 {
Some(start as usize)
} else {
None
};
}
if idx < regend.len() {
regend[idx] = if end >= 0 { Some(end as usize) } else { None };
}
}
}
fn goto_fail(
pc: &mut usize,
d: &mut usize,
fail_stack: &mut Vec<FailurePoint>,
regstart: &mut [Option<usize>],
regend: &mut [Option<usize>],
counters: &mut HashMap<usize, i16>,
) -> Option<()> {
if crate::emacs_core::eval::tls_quit_pending() {
return None;
}
let fp = fail_stack.pop()?;
*pc = fp.pattern_pos;
if let Some(sp) = fp.string_pos {
*d = sp;
}
restore_registers(&fp, regstart, regend);
*counters = fp.saved_counters;
Some(())
}
fn register_scratch(num_regs: usize) -> RegisterScratch {
let mut scratch = RegisterScratch::new();
scratch.resize(num_regs, None);
scratch
}
fn fastmap_for_syntax_class(fastmap: &mut [bool; 256], class_byte: u8, negate: bool) {
let target = match crate::emacs_core::syntax::SyntaxClass::from_code(class_byte as i64) {
Some(cls) => cls,
None => {
*fastmap = [true; 256];
return;
}
};
let table = crate::emacs_core::syntax::SyntaxTable::new_standard();
for c in 0u8..=127 {
let in_class = table.char_syntax(c as char) == target;
if in_class != negate {
fastmap[c as usize] = true;
}
}
for c in 128..256usize {
fastmap[c] = true;
}
}
fn compile_fastmap(pattern: &mut CompiledPattern) {
pattern.fastmap = [false; 256];
pattern.can_be_null = false;
let bytecode = &pattern.buffer;
if bytecode.is_empty() {
pattern.can_be_null = true;
pattern.fastmap_accurate = true;
return;
}
let case_fold = pattern.translate.is_some();
let mut worklist: Vec<usize> = vec![0];
let mut visited: HashSet<usize> = HashSet::new();
while let Some(pc) = worklist.pop() {
let mut pc = pc;
loop {
if !visited.insert(pc) {
break;
}
if pc >= bytecode.len() {
pattern.can_be_null = true;
break;
}
let Some(op) = RegexOp::from_byte(bytecode[pc]) else {
break;
};
pc += 1;
match op {
RegexOp::Succeed => {
pattern.can_be_null = true;
break;
}
RegexOp::Exactn => {
if pc >= bytecode.len() {
break;
}
let count = bytecode[pc] as usize;
pc += 1;
if count == 0 || pc >= bytecode.len() {
break;
}
let first = bytecode[pc];
pattern.fastmap[first as usize] = true;
if case_fold {
let upper = (first as char)
.to_uppercase()
.next()
.unwrap_or(first as char) as u8;
let lower = (first as char)
.to_lowercase()
.next()
.unwrap_or(first as char) as u8;
pattern.fastmap[upper as usize] = true;
pattern.fastmap[lower as usize] = true;
}
break; }
RegexOp::AnyChar => {
for c in 0..256usize {
if c != b'\n' as usize {
pattern.fastmap[c] = true;
}
}
break;
}
RegexOp::Charset => {
if pc >= bytecode.len() {
break;
}
let charset_op_pos = pc - 1;
let bitmap_len = bytecode[pc] as usize & 0x7F;
pc += 1;
for c in 0..256usize {
if c / 8 < bitmap_len && pc + c / 8 < bytecode.len() {
if (bytecode[pc + c / 8] >> (c % 8)) & 1 != 0 {
pattern.fastmap[c] = true;
}
}
}
if pattern.multibyte_charsets.contains_key(&charset_op_pos) {
for c in 128..256usize {
pattern.fastmap[c] = true;
}
}
break;
}
RegexOp::CharsetNot => {
if pc >= bytecode.len() {
break;
}
let charset_op_pos = pc - 1;
let bitmap_len = bytecode[pc] as usize & 0x7F;
pc += 1;
for c in 0..256usize {
let in_set = if c / 8 < bitmap_len && pc + c / 8 < bytecode.len() {
(bytecode[pc + c / 8] >> (c % 8)) & 1 != 0
} else {
false
};
if !in_set {
pattern.fastmap[c] = true;
}
}
break;
}
RegexOp::SyntaxSpec => {
if pc >= bytecode.len() {
break;
}
fastmap_for_syntax_class(&mut pattern.fastmap, bytecode[pc], false);
break;
}
RegexOp::NotSyntaxSpec => {
if pc >= bytecode.len() {
break;
}
fastmap_for_syntax_class(&mut pattern.fastmap, bytecode[pc], true);
break;
}
RegexOp::CategorySpec | RegexOp::NotCategorySpec => {
pattern.fastmap = [true; 256];
break;
}
RegexOp::BegLine
| RegexOp::EndLine
| RegexOp::BegBuf
| RegexOp::EndBuf
| RegexOp::AtDot
| RegexOp::WordBound
| RegexOp::NotWordBound
| RegexOp::WordBeg
| RegexOp::WordEnd
| RegexOp::SymBeg
| RegexOp::SymEnd => {
}
RegexOp::StartMemory | RegexOp::StopMemory => {
pc += 1;
}
RegexOp::Duplicate => {
pattern.fastmap = [true; 256];
break;
}
RegexOp::NoOp => {
}
RegexOp::Jump => {
if pc + 1 >= bytecode.len() {
break;
}
let offset = extract_number(bytecode, pc);
pc = ((pc as i64) + 2 + (offset as i64)) as usize;
}
RegexOp::OnFailureJump
| RegexOp::OnFailureKeepStringJump
| RegexOp::OnFailureJumpLoop
| RegexOp::OnFailureJumpNastyloop
| RegexOp::OnFailureJumpSmart => {
if pc + 1 >= bytecode.len() {
break;
}
let offset = extract_number(bytecode, pc);
pc += 2;
let target = ((pc as i64) + (offset as i64)) as usize;
worklist.push(target);
}
RegexOp::SucceedN => {
if pc + 3 >= bytecode.len() {
break;
}
let offset = extract_number(bytecode, pc);
let target = ((pc as i64) + 2 + (offset as i64)) as usize;
worklist.push(target);
pc += 4; }
RegexOp::JumpN => {
if pc + 3 >= bytecode.len() {
break;
}
let offset = extract_number(bytecode, pc);
let target = ((pc as i64) + 2 + (offset as i64)) as usize;
worklist.push(target);
pc += 4; }
RegexOp::SetNumberAt => {
pc += 4;
}
}
}
}
pattern.fastmap_accurate = true;
}
pub(crate) fn re_search(
pattern: &CompiledPattern,
text: &[u8],
start: usize,
range: isize,
syntax: &dyn SyntaxLookup,
point: usize,
) -> Option<(usize, MatchRegisters)> {
let text_len = text.len();
let fastmap_tr = |b: u8| -> u8 {
match &pattern.translate {
Some(table) => table
.get(b as usize)
.copied()
.map(|ch| ch as u32 as u8)
.unwrap_or(b),
None => b,
}
};
if range >= 0 {
let end = (start + range as usize).min(text_len);
let mut pos = start;
while pos <= end {
if pos > text_len {
break;
}
if pattern.target_multibyte && pos < text_len && (text[pos] & 0xC0) == 0x80 {
pos += 1;
continue;
}
if pattern.fastmap_accurate && !pattern.can_be_null && pos < text_len {
if !pattern.fastmap[fastmap_tr(text[pos]) as usize] {
pos += 1;
continue;
}
}
if let Some(result) = re_match(pattern, text, pos, text_len, syntax, point) {
return Some((pos, result.1));
}
pos += 1;
}
} else {
let end = start.saturating_sub((-range) as usize);
for pos in (end..=start).rev() {
if pattern.target_multibyte && pos < text_len && (text[pos] & 0xC0) == 0x80 {
continue;
}
if pattern.fastmap_accurate && !pattern.can_be_null && pos < text_len {
if !pattern.fastmap[fastmap_tr(text[pos]) as usize] {
continue;
}
}
if let Some(result) = re_match(pattern, text, pos, start, syntax, point) {
return Some((pos, result.1));
}
}
}
None
}
pub(crate) fn search_pattern(
pattern_str: &str,
text: &str,
start: usize,
case_fold: bool,
syntax: &dyn SyntaxLookup,
point: usize,
) -> Result<Option<(usize, MatchRegisters)>, RegexCompileError> {
let compiled = regex_compile(pattern_str, false, case_fold)?;
Ok(re_search(
&compiled,
text.as_bytes(),
start,
(text.len() - start) as isize,
syntax,
point,
))
}
pub(crate) fn match_pattern(
pattern_str: &str,
text: &str,
pos: usize,
case_fold: bool,
syntax: &dyn SyntaxLookup,
point: usize,
) -> Result<Option<(usize, MatchRegisters)>, RegexCompileError> {
let compiled = regex_compile(pattern_str, false, case_fold)?;
Ok(re_match(
&compiled,
text.as_bytes(),
pos,
text.len(),
syntax,
point,
))
}
#[cfg(test)]
#[path = "regex_emacs_test.rs"]
mod tests;