use crate::hir::{Hir, HirExpr};
pub const MAX_POSITIONS: usize = 64;
pub const MAX_POSITIONS_WIDE: usize = 256;
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct BitSet256 {
pub parts: [u64; 4],
}
impl BitSet256 {
#[inline]
pub const fn empty() -> Self {
Self { parts: [0; 4] }
}
#[inline]
pub const fn all_ones() -> Self {
Self { parts: [!0u64; 4] }
}
#[inline]
pub fn singleton(pos: usize) -> Self {
let mut set = Self::empty();
set.set(pos);
set
}
#[inline]
pub fn set(&mut self, pos: usize) {
let word = pos / 64;
let bit = pos % 64;
if word < 4 {
self.parts[word] |= 1u64 << bit;
}
}
#[inline]
pub fn clear(&mut self, pos: usize) {
let word = pos / 64;
let bit = pos % 64;
if word < 4 {
self.parts[word] &= !(1u64 << bit);
}
}
#[inline]
pub fn get(&self, pos: usize) -> bool {
let word = pos / 64;
let bit = pos % 64;
if word < 4 {
(self.parts[word] >> bit) & 1 != 0
} else {
false
}
}
#[inline]
pub fn is_empty(&self) -> bool {
self.parts[0] == 0 && self.parts[1] == 0 && self.parts[2] == 0 && self.parts[3] == 0
}
#[inline]
pub fn is_all_ones(&self) -> bool {
self.parts[0] == !0u64
&& self.parts[1] == !0u64
&& self.parts[2] == !0u64
&& self.parts[3] == !0u64
}
#[inline]
pub fn union(self, other: Self) -> Self {
Self {
parts: [
self.parts[0] | other.parts[0],
self.parts[1] | other.parts[1],
self.parts[2] | other.parts[2],
self.parts[3] | other.parts[3],
],
}
}
#[inline]
pub fn intersection(self, other: Self) -> Self {
Self {
parts: [
self.parts[0] & other.parts[0],
self.parts[1] & other.parts[1],
self.parts[2] & other.parts[2],
self.parts[3] & other.parts[3],
],
}
}
#[inline]
pub fn complement(self) -> Self {
Self {
parts: [
!self.parts[0],
!self.parts[1],
!self.parts[2],
!self.parts[3],
],
}
}
#[inline]
pub fn union_assign(&mut self, other: Self) {
self.parts[0] |= other.parts[0];
self.parts[1] |= other.parts[1];
self.parts[2] |= other.parts[2];
self.parts[3] |= other.parts[3];
}
#[inline]
pub fn iter_ones(&self) -> BitSet256Iter {
BitSet256Iter {
set: *self,
word_idx: 0,
}
}
}
pub struct BitSet256Iter {
set: BitSet256,
word_idx: usize,
}
impl Iterator for BitSet256Iter {
type Item = usize;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
while self.word_idx < 4 {
let word = self.set.parts[self.word_idx];
if word != 0 {
let bit_idx = word.trailing_zeros() as usize;
let pos = self.word_idx * 64 + bit_idx;
self.set.parts[self.word_idx] &= word - 1;
return Some(pos);
}
self.word_idx += 1;
}
None
}
}
#[derive(Debug, Clone)]
pub struct GlushkovNfa {
pub positions: Vec<ByteSet>,
pub follow: Vec<u64>,
pub first: u64,
pub last: u64,
pub nullable: bool,
pub position_count: usize,
}
#[derive(Debug, Clone, Default)]
pub struct ByteSet {
bits: [u64; 4],
}
impl ByteSet {
pub fn new() -> Self {
Self { bits: [0; 4] }
}
pub fn singleton(byte: u8) -> Self {
let mut set = Self::new();
set.insert(byte);
set
}
pub fn from_range(start: u8, end: u8) -> Self {
let mut set = Self::new();
for b in start..=end {
set.insert(b);
}
set
}
pub fn all() -> Self {
Self { bits: [!0u64; 4] }
}
pub fn insert(&mut self, byte: u8) {
let idx = byte as usize / 64;
let bit = byte as usize % 64;
self.bits[idx] |= 1u64 << bit;
}
pub fn contains(&self, byte: u8) -> bool {
let idx = byte as usize / 64;
let bit = byte as usize % 64;
(self.bits[idx] >> bit) & 1 != 0
}
pub fn is_empty(&self) -> bool {
self.bits.iter().all(|&b| b == 0)
}
pub fn union(&self, other: &ByteSet) -> ByteSet {
ByteSet {
bits: [
self.bits[0] | other.bits[0],
self.bits[1] | other.bits[1],
self.bits[2] | other.bits[2],
self.bits[3] | other.bits[3],
],
}
}
pub fn complement(&self) -> ByteSet {
ByteSet {
bits: [!self.bits[0], !self.bits[1], !self.bits[2], !self.bits[3]],
}
}
}
#[derive(Debug, Clone)]
struct ExprInfo {
first: u64,
last: u64,
nullable: bool,
}
impl ExprInfo {
fn empty() -> Self {
Self {
first: 0,
last: 0,
nullable: true,
}
}
}
pub struct GlushkovBuilder {
positions: Vec<ByteSet>,
follow: Vec<u64>,
}
impl GlushkovBuilder {
pub fn new() -> Self {
Self {
positions: Vec::new(),
follow: Vec::new(),
}
}
pub fn build(&mut self, hir: &Hir) -> Option<GlushkovNfa> {
if hir.props.has_backrefs || hir.props.has_lookaround {
return None;
}
self.positions.clear();
self.follow.clear();
let info = self.build_expr(&hir.expr)?;
if self.positions.len() > MAX_POSITIONS {
return None;
}
Some(GlushkovNfa {
positions: self.positions.clone(),
follow: self.follow.clone(),
first: info.first,
last: info.last,
nullable: info.nullable,
position_count: self.positions.len(),
})
}
fn build_expr(&mut self, expr: &HirExpr) -> Option<ExprInfo> {
match expr {
HirExpr::Empty => Some(ExprInfo::empty()),
HirExpr::Literal(bytes) => {
if bytes.is_empty() {
return Some(ExprInfo::empty());
}
let first_pos = self.positions.len();
for &b in bytes {
if self.positions.len() >= MAX_POSITIONS {
return None;
}
self.positions.push(ByteSet::singleton(b));
self.follow.push(0); }
let last_pos = self.positions.len() - 1;
for i in first_pos..last_pos {
self.follow[i] |= 1u64 << (i + 1);
}
Some(ExprInfo {
first: 1u64 << first_pos,
last: 1u64 << last_pos,
nullable: false,
})
}
HirExpr::Class(class) => {
if self.positions.len() >= MAX_POSITIONS {
return None;
}
let pos = self.positions.len();
let mut byte_set = ByteSet::new();
for &(start, end) in &class.ranges {
for b in start..=end {
byte_set.insert(b);
}
}
if class.negated {
byte_set = byte_set.complement();
}
self.positions.push(byte_set);
self.follow.push(0);
Some(ExprInfo {
first: 1u64 << pos,
last: 1u64 << pos,
nullable: false,
})
}
HirExpr::Concat(exprs) => {
if exprs.is_empty() {
return Some(ExprInfo::empty());
}
let mut result = self.build_expr(&exprs[0])?;
for expr in &exprs[1..] {
let next = self.build_expr(expr)?;
for pos in 0..MAX_POSITIONS {
if (result.last >> pos) & 1 != 0 && pos < self.follow.len() {
self.follow[pos] |= next.first;
}
}
let new_first = if result.nullable {
result.first | next.first
} else {
result.first
};
let new_last = if next.nullable {
next.last | result.last
} else {
next.last
};
result = ExprInfo {
first: new_first,
last: new_last,
nullable: result.nullable && next.nullable,
};
}
Some(result)
}
HirExpr::Alt(exprs) => {
if exprs.is_empty() {
return Some(ExprInfo::empty());
}
let mut result = self.build_expr(&exprs[0])?;
for expr in &exprs[1..] {
let alt = self.build_expr(expr)?;
result = ExprInfo {
first: result.first | alt.first,
last: result.last | alt.last,
nullable: result.nullable || alt.nullable,
};
}
Some(result)
}
HirExpr::Repeat(rep) => {
let min = rep.min;
let max = rep.max;
match (min, max) {
(0, Some(1)) => {
let inner = self.build_expr(&rep.expr)?;
Some(ExprInfo {
first: inner.first,
last: inner.last,
nullable: true,
})
}
(0, None) => {
let inner = self.build_expr(&rep.expr)?;
for pos in 0..MAX_POSITIONS {
if (inner.last >> pos) & 1 != 0 && pos < self.follow.len() {
self.follow[pos] |= inner.first;
}
}
Some(ExprInfo {
first: inner.first,
last: inner.last,
nullable: true,
})
}
(1, None) => {
let inner = self.build_expr(&rep.expr)?;
for pos in 0..MAX_POSITIONS {
if (inner.last >> pos) & 1 != 0 && pos < self.follow.len() {
self.follow[pos] |= inner.first;
}
}
Some(ExprInfo {
first: inner.first,
last: inner.last,
nullable: inner.nullable,
})
}
_ => self.build_bounded_repeat(&rep.expr, min, max),
}
}
HirExpr::Capture(cap) => self.build_expr(&cap.expr),
HirExpr::Anchor(_) => Some(ExprInfo::empty()),
HirExpr::Lookaround(_) | HirExpr::Backref(_) => None,
HirExpr::UnicodeCpClass(_) => None,
}
}
fn build_bounded_repeat(
&mut self,
expr: &HirExpr,
min: u32,
max: Option<u32>,
) -> Option<ExprInfo> {
if min == 0 {
return self.build_optional_repeat(expr, max.unwrap_or(1) as usize);
}
let min = min as usize;
let mut result = self.build_expr(expr)?;
let mut prev_last = result.last;
for _ in 1..min {
let copy = self.build_expr(expr)?;
for pos in 0..MAX_POSITIONS {
if (prev_last >> pos) & 1 != 0 && pos < self.follow.len() {
self.follow[pos] |= copy.first;
}
}
result.last = copy.last;
result.nullable = result.nullable && copy.nullable;
prev_last = copy.last;
}
match max {
None => {
for pos in 0..MAX_POSITIONS {
if (result.last >> pos) & 1 != 0 && pos < self.follow.len() {
self.follow[pos] |= result.last;
}
}
Some(result)
}
Some(max_val) => {
let max = max_val as usize;
let optional_count = max - min;
if optional_count == 0 {
return Some(result);
}
let mut can_end = result.last;
for _ in 0..optional_count {
let copy = self.build_expr(expr)?;
for pos in 0..MAX_POSITIONS {
if (prev_last >> pos) & 1 != 0 && pos < self.follow.len() {
self.follow[pos] |= copy.first;
}
}
prev_last = copy.last;
can_end |= copy.last; }
result.last = can_end;
Some(result)
}
}
}
fn build_optional_repeat(&mut self, expr: &HirExpr, max: usize) -> Option<ExprInfo> {
if max == 0 {
return Some(ExprInfo::empty());
}
let first_copy = self.build_expr(expr)?;
let mut result = ExprInfo {
first: first_copy.first,
last: first_copy.last,
nullable: true, };
let mut prev_last = first_copy.last;
for _ in 1..max {
let copy = self.build_expr(expr)?;
for pos in 0..MAX_POSITIONS {
if (prev_last >> pos) & 1 != 0 && pos < self.follow.len() {
self.follow[pos] |= copy.first;
}
}
result.last |= copy.last;
prev_last = copy.last;
}
Some(result)
}
}
impl Default for GlushkovBuilder {
fn default() -> Self {
Self::new()
}
}
impl GlushkovNfa {
pub fn is_shift_or_compatible(&self) -> bool {
self.position_count <= MAX_POSITIONS
}
pub fn build_shift_or_masks(&self) -> [u64; 256] {
let mut masks = [!0u64; 256];
for (pos_idx, byte_set) in self.positions.iter().enumerate() {
for byte in 0..=255u8 {
if byte_set.contains(byte) {
masks[byte as usize] &= !(1u64 << pos_idx);
}
}
}
masks
}
pub fn build_initial_state(&self) -> u64 {
!0u64
}
pub fn build_accept_mask(&self) -> u64 {
!self.last
}
}
pub fn compile_glushkov(hir: &Hir) -> Option<GlushkovNfa> {
let mut builder = GlushkovBuilder::new();
builder.build(hir)
}
#[derive(Debug, Clone)]
pub struct GlushkovWideNfa {
pub positions: Vec<ByteSet>,
pub follow: Vec<BitSet256>,
pub first: BitSet256,
pub last: BitSet256,
pub nullable: bool,
pub position_count: usize,
}
impl GlushkovWideNfa {
pub fn is_shift_or_wide_compatible(&self) -> bool {
self.position_count <= MAX_POSITIONS_WIDE && self.position_count > 0
}
pub fn build_shift_or_masks(&self) -> [BitSet256; 256] {
let mut masks = [BitSet256::all_ones(); 256];
for (pos_idx, byte_set) in self.positions.iter().enumerate() {
for byte in 0..=255u8 {
if byte_set.contains(byte) {
masks[byte as usize].clear(pos_idx);
}
}
}
masks
}
pub fn build_accept_mask(&self) -> BitSet256 {
self.last.complement()
}
}
#[derive(Debug, Clone)]
struct WideExprInfo {
first: BitSet256,
last: BitSet256,
nullable: bool,
}
impl WideExprInfo {
fn empty() -> Self {
Self {
first: BitSet256::empty(),
last: BitSet256::empty(),
nullable: true,
}
}
}
pub struct GlushkovWideBuilder {
positions: Vec<ByteSet>,
follow: Vec<BitSet256>,
}
impl GlushkovWideBuilder {
pub fn new() -> Self {
Self {
positions: Vec::new(),
follow: Vec::new(),
}
}
pub fn build(&mut self, hir: &Hir) -> Option<GlushkovWideNfa> {
if hir.props.has_backrefs || hir.props.has_lookaround {
return None;
}
self.positions.clear();
self.follow.clear();
let info = self.build_expr(&hir.expr)?;
if self.positions.len() > MAX_POSITIONS_WIDE {
return None;
}
Some(GlushkovWideNfa {
positions: self.positions.clone(),
follow: self.follow.clone(),
first: info.first,
last: info.last,
nullable: info.nullable,
position_count: self.positions.len(),
})
}
fn build_expr(&mut self, expr: &HirExpr) -> Option<WideExprInfo> {
match expr {
HirExpr::Empty => Some(WideExprInfo::empty()),
HirExpr::Literal(bytes) => {
if bytes.is_empty() {
return Some(WideExprInfo::empty());
}
let first_pos = self.positions.len();
for &b in bytes {
if self.positions.len() >= MAX_POSITIONS_WIDE {
return None;
}
self.positions.push(ByteSet::singleton(b));
self.follow.push(BitSet256::empty());
}
let last_pos = self.positions.len() - 1;
for i in first_pos..last_pos {
self.follow[i].set(i + 1);
}
Some(WideExprInfo {
first: BitSet256::singleton(first_pos),
last: BitSet256::singleton(last_pos),
nullable: false,
})
}
HirExpr::Class(class) => {
if self.positions.len() >= MAX_POSITIONS_WIDE {
return None;
}
let pos = self.positions.len();
let mut byte_set = ByteSet::new();
for &(start, end) in &class.ranges {
for b in start..=end {
byte_set.insert(b);
}
}
if class.negated {
byte_set = byte_set.complement();
}
self.positions.push(byte_set);
self.follow.push(BitSet256::empty());
Some(WideExprInfo {
first: BitSet256::singleton(pos),
last: BitSet256::singleton(pos),
nullable: false,
})
}
HirExpr::Concat(exprs) => {
if exprs.is_empty() {
return Some(WideExprInfo::empty());
}
let mut result = self.build_expr(&exprs[0])?;
for expr in &exprs[1..] {
let next = self.build_expr(expr)?;
for pos in result.last.iter_ones() {
if pos < self.follow.len() {
self.follow[pos].union_assign(next.first);
}
}
let new_first = if result.nullable {
result.first.union(next.first)
} else {
result.first
};
let new_last = if next.nullable {
next.last.union(result.last)
} else {
next.last
};
result = WideExprInfo {
first: new_first,
last: new_last,
nullable: result.nullable && next.nullable,
};
}
Some(result)
}
HirExpr::Alt(exprs) => {
if exprs.is_empty() {
return Some(WideExprInfo::empty());
}
let mut result = self.build_expr(&exprs[0])?;
for expr in &exprs[1..] {
let alt = self.build_expr(expr)?;
result = WideExprInfo {
first: result.first.union(alt.first),
last: result.last.union(alt.last),
nullable: result.nullable || alt.nullable,
};
}
Some(result)
}
HirExpr::Repeat(rep) => {
let min = rep.min;
let max = rep.max;
match (min, max) {
(0, Some(1)) => {
let inner = self.build_expr(&rep.expr)?;
Some(WideExprInfo {
first: inner.first,
last: inner.last,
nullable: true,
})
}
(0, None) => {
let inner = self.build_expr(&rep.expr)?;
for pos in inner.last.iter_ones() {
if pos < self.follow.len() {
self.follow[pos].union_assign(inner.first);
}
}
Some(WideExprInfo {
first: inner.first,
last: inner.last,
nullable: true,
})
}
(1, None) => {
let inner = self.build_expr(&rep.expr)?;
for pos in inner.last.iter_ones() {
if pos < self.follow.len() {
self.follow[pos].union_assign(inner.first);
}
}
Some(WideExprInfo {
first: inner.first,
last: inner.last,
nullable: inner.nullable,
})
}
_ => self.build_bounded_repeat_wide(&rep.expr, min, max),
}
}
HirExpr::Capture(cap) => self.build_expr(&cap.expr),
HirExpr::Anchor(_) => Some(WideExprInfo::empty()),
HirExpr::Lookaround(_) | HirExpr::Backref(_) => None,
HirExpr::UnicodeCpClass(_) => None,
}
}
fn build_bounded_repeat_wide(
&mut self,
expr: &HirExpr,
min: u32,
max: Option<u32>,
) -> Option<WideExprInfo> {
if min == 0 {
return self.build_optional_repeat_wide(expr, max.unwrap_or(1) as usize);
}
let min = min as usize;
let mut result = self.build_expr(expr)?;
let mut prev_last = result.last;
for _ in 1..min {
let copy = self.build_expr(expr)?;
for pos in prev_last.iter_ones() {
if pos < self.follow.len() {
self.follow[pos].union_assign(copy.first);
}
}
result.last = copy.last;
result.nullable = result.nullable && copy.nullable;
prev_last = copy.last;
}
match max {
None => {
for pos in result.last.iter_ones() {
if pos < self.follow.len() {
self.follow[pos].union_assign(result.last);
}
}
Some(result)
}
Some(max_val) => {
let max = max_val as usize;
let optional_count = max - min;
if optional_count == 0 {
return Some(result);
}
let mut can_end = result.last;
for _ in 0..optional_count {
let copy = self.build_expr(expr)?;
for pos in prev_last.iter_ones() {
if pos < self.follow.len() {
self.follow[pos].union_assign(copy.first);
}
}
prev_last = copy.last;
can_end = can_end.union(copy.last);
}
result.last = can_end;
Some(result)
}
}
}
fn build_optional_repeat_wide(&mut self, expr: &HirExpr, max: usize) -> Option<WideExprInfo> {
if max == 0 {
return Some(WideExprInfo::empty());
}
let first_copy = self.build_expr(expr)?;
let mut result = WideExprInfo {
first: first_copy.first,
last: first_copy.last,
nullable: true,
};
let mut prev_last = first_copy.last;
for _ in 1..max {
let copy = self.build_expr(expr)?;
for pos in prev_last.iter_ones() {
if pos < self.follow.len() {
self.follow[pos].union_assign(copy.first);
}
}
result.last = result.last.union(copy.last);
prev_last = copy.last;
}
Some(result)
}
}
impl Default for GlushkovWideBuilder {
fn default() -> Self {
Self::new()
}
}
pub fn compile_glushkov_wide(hir: &Hir) -> Option<GlushkovWideNfa> {
let mut builder = GlushkovWideBuilder::new();
builder.build(hir)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::hir::translate;
use crate::parser::parse;
fn make_glushkov(pattern: &str) -> Option<GlushkovNfa> {
let ast = parse(pattern).ok()?;
let hir = translate(&ast).ok()?;
compile_glushkov(&hir)
}
#[test]
fn test_simple_literal() {
let nfa = make_glushkov("abc").unwrap();
assert_eq!(nfa.position_count, 3);
assert!(!nfa.nullable);
assert!(nfa.positions[0].contains(b'a'));
assert!(nfa.positions[1].contains(b'b'));
assert!(nfa.positions[2].contains(b'c'));
}
#[test]
fn test_alternation() {
let nfa = make_glushkov("a|b").unwrap();
assert_eq!(nfa.position_count, 2);
assert!(!nfa.nullable);
assert_eq!(nfa.first, 0b11);
assert_eq!(nfa.last, 0b11);
}
#[test]
fn test_optional() {
let nfa = make_glushkov("a?").unwrap();
assert_eq!(nfa.position_count, 1);
assert!(nfa.nullable); }
#[test]
fn test_star() {
let nfa = make_glushkov("a*").unwrap();
assert_eq!(nfa.position_count, 1);
assert!(nfa.nullable); }
#[test]
fn test_plus() {
let nfa = make_glushkov("a+").unwrap();
assert_eq!(nfa.position_count, 1);
assert!(!nfa.nullable); }
#[test]
fn test_class() {
let nfa = make_glushkov("[a-z]").unwrap();
assert_eq!(nfa.position_count, 1);
for c in b'a'..=b'z' {
assert!(nfa.positions[0].contains(c));
}
assert!(!nfa.positions[0].contains(b'A'));
}
#[test]
fn test_too_many_positions() {
let pattern = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnop";
assert!(pattern.len() > MAX_POSITIONS);
let result = make_glushkov(pattern);
assert!(result.is_none());
}
#[test]
fn test_shift_or_masks() {
let nfa = make_glushkov("ab").unwrap();
let masks = nfa.build_shift_or_masks();
assert_eq!(masks[b'a' as usize] & 1, 0);
assert_eq!(masks[b'b' as usize] & 2, 0);
assert_eq!(masks[b'c' as usize], !0u64);
}
#[test]
fn test_dot_star_glushkov() {
let nfa = make_glushkov("a.*b").unwrap();
assert_eq!(nfa.position_count, 3);
assert_eq!(nfa.first, 0b001);
assert_eq!(nfa.last, 0b100);
assert!(!nfa.nullable);
assert!(nfa.positions[0].contains(b'a'));
assert!(!nfa.positions[0].contains(b'b'));
assert!(nfa.positions[1].contains(b'a'));
assert!(nfa.positions[1].contains(b'b'));
assert!(nfa.positions[1].contains(b'x'));
assert!(nfa.positions[2].contains(b'b'));
assert!(!nfa.positions[2].contains(b'a'));
assert_eq!(
nfa.follow[0] & 0b110,
0b110,
"Follow(0) should include positions 1 and 2"
);
assert_eq!(
nfa.follow[1] & 0b110,
0b110,
"Follow(1) should include positions 1 and 2"
);
assert_eq!(nfa.follow[2], 0, "Follow(2) should be empty");
}
}