use crate::hir::Hir;
use crate::nfa::{
compile_glushkov, compile_glushkov_wide, BitSet256, GlushkovNfa, GlushkovWideNfa,
MAX_POSITIONS, MAX_POSITIONS_WIDE,
};
#[derive(Debug)]
pub struct ShiftOr {
pub(crate) masks: [u64; 256],
pub(crate) accept: u64,
pub(crate) first: u64,
pub(crate) follow: Vec<u64>,
pub(crate) nullable: bool,
pub(crate) position_count: usize,
pub(crate) has_leading_word_boundary: bool,
pub(crate) has_trailing_word_boundary: bool,
pub(crate) has_start_anchor: bool,
pub(crate) has_end_anchor: bool,
}
impl ShiftOr {
pub fn from_hir(hir: &Hir) -> Option<Self> {
if hir.props.has_backrefs
|| hir.props.has_lookaround
|| hir.props.has_anchors
|| hir.props.has_word_boundary
|| hir.props.has_non_greedy
{
return None;
}
let glushkov = compile_glushkov(hir)?;
Self::from_glushkov_with_boundaries(&glushkov, false, false)
}
pub fn from_hir_with_anchors(hir: &Hir) -> Option<Self> {
if hir.props.has_backrefs
|| hir.props.has_lookaround
|| hir.props.has_word_boundary
|| hir.props.has_non_greedy
{
return None;
}
let glushkov = compile_glushkov(hir)?;
let has_start_anchor = hir.props.has_start_anchor;
let has_end_anchor = hir.props.has_end_anchor;
Self::from_glushkov_with_options(&glushkov, false, false, has_start_anchor, has_end_anchor)
}
pub fn from_glushkov(nfa: &GlushkovNfa) -> Option<Self> {
Self::from_glushkov_with_options(nfa, false, false, false, false)
}
fn from_glushkov_with_boundaries(
nfa: &GlushkovNfa,
has_leading_word_boundary: bool,
has_trailing_word_boundary: bool,
) -> Option<Self> {
Self::from_glushkov_with_options(
nfa,
has_leading_word_boundary,
has_trailing_word_boundary,
false,
false,
)
}
fn from_glushkov_with_options(
nfa: &GlushkovNfa,
has_leading_word_boundary: bool,
has_trailing_word_boundary: bool,
has_start_anchor: bool,
has_end_anchor: bool,
) -> Option<Self> {
if nfa.position_count > MAX_POSITIONS || nfa.position_count == 0 {
return None;
}
let masks = nfa.build_shift_or_masks();
let accept = nfa.build_accept_mask();
Some(Self {
masks,
accept,
first: nfa.first,
follow: nfa.follow.clone(),
nullable: nfa.nullable,
position_count: nfa.position_count,
has_leading_word_boundary,
has_trailing_word_boundary,
has_start_anchor,
has_end_anchor,
})
}
#[inline]
pub fn has_word_boundary(&self) -> bool {
self.has_leading_word_boundary || self.has_trailing_word_boundary
}
pub fn state_count(&self) -> usize {
self.position_count
}
pub fn masks(&self) -> &[u64; 256] {
&self.masks
}
pub fn accept(&self) -> u64 {
self.accept
}
pub fn first(&self) -> u64 {
self.first
}
pub fn follow(&self) -> &[u64] {
&self.follow
}
pub fn is_nullable(&self) -> bool {
self.nullable
}
pub fn has_leading_word_boundary(&self) -> bool {
self.has_leading_word_boundary
}
pub fn has_trailing_word_boundary(&self) -> bool {
self.has_trailing_word_boundary
}
pub fn is_match(&self, input: &[u8]) -> bool {
self.find(input).is_some()
}
pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
if self.has_start_anchor {
if let Some(end) = self.match_at(input, 0) {
if self.has_end_anchor && end != input.len() {
return None;
}
return Some((0, end));
}
if self.nullable && (!self.has_end_anchor || input.is_empty()) {
return Some((0, 0));
}
return None;
}
for start in 0..=input.len() {
if let Some(end) = self.match_at(input, start) {
if self.has_end_anchor && end != input.len() {
continue;
}
return Some((start, end));
}
}
if self.nullable && !self.has_end_anchor {
return Some((0, 0));
}
None
}
pub fn find_at(&self, input: &[u8], pos: usize) -> Option<(usize, usize)> {
if pos > input.len() {
return None;
}
if self.has_start_anchor && pos > 0 {
return None;
}
let search_start = if self.has_start_anchor { 0 } else { pos };
for start in search_start..=input.len() {
if let Some(end) = self.match_at(input, start) {
if self.has_end_anchor && end != input.len() {
if self.has_start_anchor {
return None;
}
continue;
}
return Some((start, end));
}
if self.has_start_anchor {
break;
}
}
None
}
pub fn try_match_at(&self, input: &[u8], pos: usize) -> Option<(usize, usize)> {
if self.has_start_anchor && pos != 0 {
return None;
}
match self.match_at(input, pos) {
Some(end) => {
if self.has_end_anchor && end != input.len() {
return None;
}
Some((pos, end))
}
None => None,
}
}
fn match_at(&self, input: &[u8], start: usize) -> Option<usize> {
if start > input.len() {
return None;
}
let mut last_match = None;
if self.nullable {
last_match = Some(start);
}
let mut state = !0u64;
for (i, &byte) in input[start..].iter().enumerate() {
let byte_mask = self.masks[byte as usize];
if i == 0 {
state = (!self.first) | byte_mask;
} else {
let mut active = !state;
let mut reachable = 0u64;
while active != 0 {
let pos = active.trailing_zeros() as usize;
reachable |= self.follow[pos];
active &= active - 1; }
state = (!reachable) | byte_mask;
}
if (state | self.accept) != !0u64 {
last_match = Some(start + i + 1);
}
if state == !0u64 {
break;
}
}
last_match
}
}
pub fn is_shift_or_compatible(hir: &Hir) -> bool {
if hir.props.has_backrefs
|| hir.props.has_lookaround
|| hir.props.has_multiline_anchors
|| hir.props.has_word_boundary
|| hir.props.has_non_greedy
{
return false;
}
compile_glushkov(hir)
.map(|nfa| nfa.position_count <= MAX_POSITIONS && nfa.position_count > 0)
.unwrap_or(false)
}
#[derive(Debug)]
pub struct ShiftOrWide {
pub(crate) masks: Box<[BitSet256; 256]>,
pub(crate) accept: BitSet256,
pub(crate) first: BitSet256,
pub(crate) follow: Vec<BitSet256>,
pub(crate) nullable: bool,
pub(crate) position_count: usize,
}
impl ShiftOrWide {
pub fn from_hir(hir: &Hir) -> Option<Self> {
if hir.props.has_backrefs
|| hir.props.has_lookaround
|| hir.props.has_anchors
|| hir.props.has_word_boundary
|| hir.props.has_non_greedy
{
return None;
}
let glushkov = compile_glushkov_wide(hir)?;
Self::from_glushkov(&glushkov)
}
pub fn from_glushkov(nfa: &GlushkovWideNfa) -> Option<Self> {
if nfa.position_count > MAX_POSITIONS_WIDE || nfa.position_count == 0 {
return None;
}
let masks = Box::new(nfa.build_shift_or_masks());
let accept = nfa.build_accept_mask();
Some(Self {
masks,
accept,
first: nfa.first,
follow: nfa.follow.clone(),
nullable: nfa.nullable,
position_count: nfa.position_count,
})
}
pub fn state_count(&self) -> usize {
self.position_count
}
pub fn is_nullable(&self) -> bool {
self.nullable
}
pub fn is_match(&self, input: &[u8]) -> bool {
self.find(input).is_some()
}
pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
for start in 0..=input.len() {
if let Some(end) = self.match_at(input, start) {
return Some((start, end));
}
}
if self.nullable {
return Some((0, 0));
}
None
}
pub fn find_at(&self, input: &[u8], pos: usize) -> Option<(usize, usize)> {
if pos > input.len() {
return None;
}
for start in pos..=input.len() {
if let Some(end) = self.match_at(input, start) {
return Some((start, end));
}
}
None
}
pub fn try_match_at(&self, input: &[u8], pos: usize) -> Option<(usize, usize)> {
self.match_at(input, pos).map(|end| (pos, end))
}
fn match_at(&self, input: &[u8], start: usize) -> Option<usize> {
if start > input.len() {
return None;
}
let mut last_match = None;
if self.nullable {
last_match = Some(start);
}
let mut state = BitSet256::all_ones();
for (i, &byte) in input[start..].iter().enumerate() {
let byte_mask = self.masks[byte as usize];
if i == 0 {
state = self.first.complement().union(byte_mask);
} else {
let active = state.complement();
let mut reachable = BitSet256::empty();
for word_idx in 0..4 {
let mut word = active.parts[word_idx];
while word != 0 {
let bit_idx = word.trailing_zeros() as usize;
let pos = word_idx * 64 + bit_idx;
if pos < self.follow.len() {
reachable.union_assign(self.follow[pos]);
}
word &= word - 1; }
}
state = reachable.complement().union(byte_mask);
}
if !state.union(self.accept).is_all_ones() {
last_match = Some(start + i + 1);
}
if state.is_all_ones() {
break;
}
}
last_match
}
}
pub fn is_shift_or_wide_compatible(hir: &Hir) -> bool {
if hir.props.has_backrefs
|| hir.props.has_lookaround
|| hir.props.has_anchors
|| hir.props.has_word_boundary
|| hir.props.has_non_greedy
{
return false;
}
compile_glushkov_wide(hir)
.map(|nfa| {
nfa.position_count > MAX_POSITIONS
&& nfa.position_count <= MAX_POSITIONS_WIDE
&& nfa.position_count > 0
})
.unwrap_or(false)
}