use std::{fmt::Display, rc::Rc};
use crate::query::ast::Query;
use crate::query::common::TransitionLabel;
pub struct QueryNFA {
pub num_states: usize,
pub transitions: Vec<Vec<(usize, usize)>>,
pub pos_to_label: Vec<TransitionLabel>,
pub start_state: usize,
pub is_accepting: Vec<bool>,
pub is_first: Vec<bool>,
pub is_ending: Vec<bool>,
pub factors: Vec<Vec<usize>>,
pub contains_empty_word: bool,
}
impl Display for QueryNFA {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "NFA States: {}", self.num_states)?;
writeln!(f, "Start State: {}", self.start_state)?;
writeln!(f, "Accepting States: {:?}", {
self.is_accepting
.iter()
.enumerate()
.filter_map(|(i, &b)| if b { Some(i) } else { None })
.collect::<Vec<_>>()
})?;
writeln!(
f,
"First set: {:?}",
self.is_first
.iter()
.enumerate()
.filter_map(|(i, &b)| if b {
Some(format!("[{}] {}", i, &self.pos_to_label[i]))
} else {
None
})
.collect::<Vec<_>>()
)?;
writeln!(
f,
"Last set: {:?}",
self.is_ending
.iter()
.enumerate()
.filter_map(|(i, &b)| if b {
Some(format!("[{}] {}", i, &self.pos_to_label[i]))
} else {
None
})
.collect::<Vec<_>>()
)?;
writeln!(f, "Factors set:")?;
for (i, followers) in self.factors.iter().enumerate() {
if followers.is_empty() {
writeln!(
f,
"\t[{}] {} cannot be followed",
i, &self.pos_to_label[i]
)?;
continue;
}
writeln!(
f,
"\t[{}] {} can be followed by:",
i, &self.pos_to_label[i]
)?;
for &j in followers {
writeln!(f, "\t\t[{}] {}", j, &self.pos_to_label[j])?;
}
}
writeln!(f, "Transitions:")?;
for (st, row) in self.transitions.iter().enumerate() {
writeln!(f, "\tstate {st}:")?;
for (label_idx, dest) in row {
writeln!(
f,
"\t\ton [{}] {} -> [{}]",
*label_idx, self.pos_to_label[*label_idx], dest
)?;
}
}
Ok(())
}
}
impl QueryNFA {
#[must_use]
pub fn from_query(query: &Query) -> Self {
let mut temp_nfa = Self {
num_states: 1, transitions: Vec::new(),
pos_to_label: Vec::new(),
start_state: 0,
is_accepting: vec![false; 1], is_first: Vec::new(),
is_ending: Vec::new(),
factors: Vec::new(),
contains_empty_word: false,
};
temp_nfa.linearize_query(query);
let alphabet_size = temp_nfa.pos_to_label.len();
if alphabet_size == 0 {
let empty_nfa = Self {
num_states: 1, transitions: Vec::new(),
pos_to_label: Vec::new(),
start_state: 0,
is_accepting: vec![true; 1],
is_first: Vec::new(),
is_ending: Vec::new(),
factors: Vec::new(),
contains_empty_word: true,
};
#[cfg(test)]
println!("Constructed NFA for `{query}`:\n{empty_nfa}");
return empty_nfa;
}
temp_nfa.num_states += alphabet_size;
temp_nfa.contains_empty_word = contains_empty_word(query);
let mut position: usize;
position = 0;
temp_nfa.is_first = vec![false; alphabet_size];
compute_first_set(&mut temp_nfa.is_first, query, &mut position);
position = 0;
temp_nfa.is_ending = vec![false; alphabet_size];
compute_last_set(&mut temp_nfa.is_ending, query, &mut position);
position = 0;
temp_nfa.factors = vec![Vec::new(); alphabet_size];
compute_follows_set(&mut temp_nfa.factors, query, &mut position);
temp_nfa.transitions = vec![Vec::new(); 1 + alphabet_size];
temp_nfa.is_accepting = vec![false; 1 + alphabet_size];
let nfa = temp_nfa.construct_nfa();
#[cfg(test)]
println!("Constructed NFA for `{query}`:\n{nfa}");
nfa
}
fn linearize_query(&mut self, query: &Query) {
match query {
Query::Field(name) => {
let name_rc: Rc<String> = Rc::new(name.clone());
self.pos_to_label.push(TransitionLabel::Field(name_rc));
}
Query::FieldWildcard => {
let field_wildcard = TransitionLabel::FieldWildcard;
self.pos_to_label.push(field_wildcard);
}
Query::Index(idx) => {
let range = TransitionLabel::Range(*idx, *idx + 1);
self.pos_to_label.push(range);
}
Query::Range(s, e) => {
let range = TransitionLabel::Range(
(*s).unwrap_or(0),
(*e).unwrap_or(usize::MAX),
);
self.pos_to_label.push(range);
}
Query::RangeFrom(s) => {
self.pos_to_label.push(TransitionLabel::RangeFrom(*s));
}
Query::ArrayWildcard => {
let range = TransitionLabel::Range(0, usize::MAX);
self.pos_to_label.push(range);
}
Query::Disjunction(queries) | Query::Sequence(queries) => {
for q in queries {
self.linearize_query(q);
}
}
Query::KleeneStar(q) | Query::Optional(q) => {
self.linearize_query(q);
}
Query::Regex(_) => unimplemented!(),
}
}
#[must_use]
pub fn construct_nfa(&mut self) -> Self {
if self.contains_empty_word {
self.is_accepting[0] = true;
}
for (pos, &is_first) in self.is_first.iter().enumerate() {
if is_first {
self.transitions[0].push((pos, pos + 1));
}
}
for (pos, &is_ending) in self.is_ending.iter().enumerate() {
if is_ending {
self.is_accepting[pos + 1] = true; }
}
for (from_state, followers) in self.factors.iter().enumerate() {
for &follower in followers {
self.transitions[from_state + 1]
.push((follower, follower + 1));
}
}
Self {
num_states: self.num_states,
transitions: std::mem::take(&mut self.transitions),
pos_to_label: std::mem::take(&mut self.pos_to_label),
start_state: self.start_state,
is_accepting: std::mem::take(&mut self.is_accepting),
is_first: std::mem::take(&mut self.is_first),
is_ending: std::mem::take(&mut self.is_ending),
factors: std::mem::take(&mut self.factors),
contains_empty_word: self.contains_empty_word,
}
}
}
pub fn contains_empty_word(query: &Query) -> bool {
match query {
Query::Field(_)
| Query::Index(_)
| Query::Range(_, _)
| Query::RangeFrom(_)
| Query::ArrayWildcard
| Query::FieldWildcard => false,
Query::Sequence(queries) => queries.iter().all(contains_empty_word),
Query::Disjunction(queries) => queries.iter().any(contains_empty_word),
Query::Optional(_) | Query::KleeneStar(_) => true,
Query::Regex(_) => unimplemented!(),
}
}
pub fn compute_first_set(
first_set: &mut [bool],
query: &Query,
position: &mut usize,
) {
match query {
Query::Field(_)
| Query::Index(_)
| Query::Range(_, _)
| Query::RangeFrom(_)
| Query::ArrayWildcard
| Query::FieldWildcard
| Query::Regex(_) => {
if *position < first_set.len() {
first_set[*position] = true;
*position += 1;
}
}
Query::Disjunction(queries) => {
for q in queries {
let start_pos = *position;
let branch_len = count_subquery_positions(q);
compute_first_set(first_set, q, position);
*position = start_pos + branch_len;
}
}
Query::Sequence(queries) => {
for q in queries {
compute_first_set(first_set, q, position);
if !contains_empty_word(q) {
break;
}
}
}
Query::KleeneStar(q) | Query::Optional(q) => {
compute_first_set(first_set, q, position);
}
}
}
pub fn compute_last_set(
last_set: &mut [bool],
query: &Query,
position: &mut usize,
) {
match query {
Query::Field(_)
| Query::Index(_)
| Query::Range(_, _)
| Query::RangeFrom(_)
| Query::ArrayWildcard
| Query::FieldWildcard
| Query::Regex(_) => {
if *position < last_set.len() {
last_set[*position] = true;
*position += 1;
}
}
Query::Disjunction(queries) => {
for q in queries {
let start_pos = *position;
let branch_len = count_subquery_positions(q);
compute_last_set(last_set, q, position);
*position = start_pos + branch_len;
}
}
Query::Sequence(queries) => {
let subquery_lengths: Vec<usize> =
queries.iter().map(count_subquery_positions).collect();
let seq_start_pos = *position;
for (i, q) in queries.iter().enumerate().rev() {
let mut subquery_start =
seq_start_pos + subquery_lengths[..i].iter().sum::<usize>();
compute_last_set(last_set, q, &mut subquery_start);
if !contains_empty_word(q) {
break;
}
}
*position = seq_start_pos + subquery_lengths.iter().sum::<usize>();
}
Query::KleeneStar(q) | Query::Optional(q) => {
compute_last_set(last_set, q, position);
}
}
}
fn count_subquery_positions(query: &Query) -> usize {
match query {
Query::Field(_)
| Query::Index(_)
| Query::Range(_, _)
| Query::RangeFrom(_)
| Query::ArrayWildcard
| Query::FieldWildcard => 1,
Query::Sequence(queries) | Query::Disjunction(queries) => {
queries.iter().map(count_subquery_positions).sum()
}
Query::Optional(q) | Query::KleeneStar(q) => {
count_subquery_positions(q)
}
Query::Regex(_) => unimplemented!(),
}
}
pub fn compute_follows_set(
factors: &mut [Vec<usize>],
query: &Query,
position: &mut usize,
) {
match query {
Query::Field(_)
| Query::Index(_)
| Query::Range(_, _)
| Query::RangeFrom(_)
| Query::ArrayWildcard
| Query::FieldWildcard
| Query::Regex(_) => {
*position += 1;
}
Query::Disjunction(queries) => {
for q in queries {
compute_follows_set(factors, q, position);
}
}
Query::Sequence(queries) => {
let mut subquery_ranges = Vec::new();
for q in queries {
let sub_start = *position;
let sub_len = count_subquery_positions(q);
compute_follows_set(factors, q, position);
subquery_ranges.push((sub_start, sub_start + sub_len));
}
for i in 0..queries.len() {
let left_query = &queries[i];
let (left_start, left_end) = subquery_ranges[i];
let mut left_last = vec![false; factors.len()];
let mut left_pos = left_start;
compute_last_set(&mut left_last, left_query, &mut left_pos);
for j in (i + 1)..queries.len() {
let can_skip_middle =
(i + 1..j).all(|k| contains_empty_word(&queries[k]));
if can_skip_middle {
let right_query = &queries[j];
let (right_start, right_end) = subquery_ranges[j];
let mut right_first = vec![false; factors.len()];
let mut right_pos = right_start;
compute_first_set(
&mut right_first,
right_query,
&mut right_pos,
);
for left_idx in left_start..left_end {
if left_last[left_idx] {
(right_start..right_end).for_each(
|right_idx| {
if right_first[right_idx] {
factors[left_idx].push(right_idx);
}
},
);
}
}
}
if !contains_empty_word(&queries[j]) {
break;
}
}
}
}
Query::KleeneStar(q) => {
let start_pos = *position;
let q_len = count_subquery_positions(q);
compute_follows_set(factors, q, position);
let mut last_set = vec![false; factors.len()];
let mut last_pos = start_pos;
compute_last_set(&mut last_set, q, &mut last_pos);
let mut first_set = vec![false; factors.len()];
let mut first_pos = start_pos;
compute_first_set(&mut first_set, q, &mut first_pos);
for i in start_pos..start_pos + q_len {
if last_set[i] {
(start_pos..start_pos + q_len).for_each(|j| {
if first_set[j] {
factors[i].push(j);
}
});
}
}
}
Query::Optional(q) => {
compute_follows_set(factors, q, position);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::query::QueryBuilder;
fn number_of_members(bitset: &[bool]) -> usize {
bitset.iter().fold(0, |acc, b| if *b { acc + 1 } else { acc })
}
#[test]
fn empty_query_nfa() {
let query = QueryBuilder::new().build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(nfa.num_states, 1);
assert!(nfa.contains_empty_word);
}
#[test]
fn simple_field_nfa() {
let query = QueryBuilder::new().field("foo").build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(nfa.num_states, 2); assert_eq!(nfa.pos_to_label.len(), 1);
assert!(!nfa.contains_empty_word);
assert!(nfa.is_first[0]);
assert_eq!(number_of_members(&nfa.is_first), 1);
assert!(nfa.is_ending[0]);
assert_eq!(number_of_members(&nfa.is_ending), 1);
}
#[test]
fn simple_optional_field_nfa() {
let query = QueryBuilder::new().field("foo").optional().build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(nfa.num_states, 2); assert_eq!(nfa.pos_to_label.len(), 1);
assert!(nfa.contains_empty_word);
assert_eq!(number_of_members(&nfa.is_accepting), 2);
assert_eq!(number_of_members(&nfa.is_first), 1);
assert_eq!(number_of_members(&nfa.is_ending), 1);
assert!(nfa.is_first[0]);
assert!(nfa.is_ending[0]);
}
#[test]
fn simple_seq_nfa() {
let query =
QueryBuilder::new().field("foo").field("bar").field("baz").build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(number_of_members(&nfa.is_accepting), 1);
assert_eq!(number_of_members(&nfa.is_first), 1);
assert!(nfa.is_first[0]);
assert_eq!(number_of_members(&nfa.is_ending), 1);
assert!(nfa.is_ending[2]);
}
#[test]
fn simple_seq_dis_nfa() {
let query1 =
QueryBuilder::new().field("foo").field("bar").field("baz").build();
let query2 =
QueryBuilder::new().field("x").field("y").field("z").build();
let query =
QueryBuilder::new().disjunction(vec![query1, query2]).build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(number_of_members(&nfa.is_accepting), 2);
assert_eq!(number_of_members(&nfa.is_first), 2);
assert!(nfa.is_first[0]);
assert!(nfa.is_first[3]);
assert_eq!(number_of_members(&nfa.is_ending), 2);
assert!(nfa.is_ending[2]);
assert!(nfa.is_ending[5]);
}
#[test]
fn simple_disjunction_nfa() {
let query1 = QueryBuilder::new().field("foo").build();
let query2 = QueryBuilder::new().field("bar").build();
let query =
QueryBuilder::new().disjunction(vec![query1, query2]).build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(number_of_members(&nfa.is_accepting), 2);
assert_eq!(number_of_members(&nfa.is_first), 2);
assert!(&nfa.is_first[0]); assert!(&nfa.is_first[1]);
assert_eq!(number_of_members(&nfa.is_ending), 2);
assert!(nfa.is_ending[0]);
assert!(nfa.is_ending[1]);
}
#[test]
fn field_branch_disjunction_nfa() {
let query1 = QueryBuilder::new().field("foo").field("a").build();
let query2 = QueryBuilder::new().field("foo").field("b").build();
let query =
QueryBuilder::new().disjunction(vec![query1, query2]).build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(number_of_members(&nfa.is_accepting), 2);
assert_eq!(number_of_members(&nfa.is_first), 2); assert!(&nfa.is_first[0]); assert!(&nfa.is_first[2]);
assert_eq!(number_of_members(&nfa.is_ending), 2);
assert!(&nfa.is_ending[1]);
assert!(&nfa.is_ending[3]);
}
#[test]
fn foobar_nfa() {
let query1 = QueryBuilder::new().field("foo").field("a").build();
let query2 = QueryBuilder::new().field("bar").field("b").build();
let query =
QueryBuilder::new().disjunction(vec![query1, query2]).build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(number_of_members(&nfa.is_accepting), 2);
assert_eq!(number_of_members(&nfa.is_first), 2);
assert!(&nfa.is_first[0]);
assert!(&nfa.is_first[2]);
assert_eq!(number_of_members(&nfa.is_ending), 2);
assert!(&nfa.is_ending[1]);
assert!(&nfa.is_ending[3]);
}
#[test]
fn complex_disjunction_nfa() {
let query1 = QueryBuilder::new().field("foo").field("bar").build();
let query2 = QueryBuilder::new().field("bar").optional().build();
let query3 = QueryBuilder::new().field("baz").kleene_star().build();
let query = QueryBuilder::new()
.disjunction(vec![query1, query2, query3])
.build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(number_of_members(&nfa.is_accepting), 4);
assert_eq!(number_of_members(&nfa.is_first), 3);
assert!(&nfa.is_first[0]); assert!(&nfa.is_first[2]); assert!(&nfa.is_first[3]);
assert_eq!(number_of_members(&nfa.is_ending), 3);
assert!(&nfa.is_ending[1]); assert!(&nfa.is_ending[2]); assert!(&nfa.is_ending[3]); }
#[test]
fn range_overlap_nfa() {
let query1 = QueryBuilder::new().field("foo").index(1).build();
let query2 = QueryBuilder::new().field("foo").array_wildcard().build();
let query =
QueryBuilder::new().disjunction(vec![query1, query2]).build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(number_of_members(&nfa.is_accepting), 2);
assert_eq!(number_of_members(&nfa.is_first), 2);
assert!(&nfa.is_first[0]); assert!(&nfa.is_first[2]);
assert_eq!(number_of_members(&nfa.is_ending), 2);
assert!(&nfa.is_ending[1]); assert!(&nfa.is_ending[3]); }
#[test]
fn kleene_nfa() {
let query =
QueryBuilder::new().field("a").kleene_star().field("b").build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(number_of_members(&nfa.is_accepting), 1);
assert_eq!(number_of_members(&nfa.is_first), 2); assert!(&nfa.is_first[0]); assert!(&nfa.is_first[1]);
assert_eq!(number_of_members(&nfa.is_ending), 1); assert!(&nfa.is_ending[1]); }
#[test]
fn multiple_optional_nfa() {
let query = QueryBuilder::new()
.field("a")
.kleene_star()
.field("b")
.optional()
.field("c")
.optional()
.build();
let nfa = QueryNFA::from_query(&query);
assert_eq!(number_of_members(&nfa.is_accepting), 4);
assert_eq!(number_of_members(&nfa.is_first), 3); assert!(&nfa.is_first[0]); assert!(&nfa.is_first[1]); assert!(&nfa.is_first[2]);
assert_eq!(number_of_members(&nfa.is_ending), 3); assert!(&nfa.is_ending[0]); assert!(&nfa.is_ending[1]); assert!(&nfa.is_ending[2]); }
#[test]
fn kleene_sequence_nfa() {
let query = QueryBuilder::new()
.field_wildcard()
.kleene_star()
.array_wildcard()
.kleene_star()
.array_wildcard()
.build();
let nfa = QueryNFA::from_query(&query);
assert!(
nfa.factors[0].contains(&2),
"FieldWildcard should be followed by second ArrayWildcard"
);
}
}