extern crate regex;
use std::collections::HashMap;
use regex::Regex;
use std::cmp::Ordering;
use std::marker::PhantomData;
use std::fmt::Debug;
#[derive(Debug)]
pub enum Associativity
{
Left,
Right,
}
#[derive(Debug)]
pub enum ParsingError
{
NotInGrammar,
Ambiguous,
BadToken,
}
#[derive(Clone,Debug)]
pub enum EarleyKind
{
Complete(usize,usize,usize), Scan(usize,usize), Predict(usize),}
#[derive(Clone,Debug)]
pub struct AmbiguityInfo<T>
{
states: Vec<State<T>>,
index: usize,
}
impl<T> Default for AmbiguityInfo<T>
{
fn default() -> Self
{
AmbiguityInfo{
states: vec![],
index: 0,
}
}
}
#[derive(Clone,Debug)]
pub struct State<T>
{
pub rule: usize,
pub left: usize,
pub right: Vec<usize>,
pub position: usize,
pub original_set: usize,
pub kind: EarleyKind,
pub values: Vec<T>,
pub computed_value: Option<T>,
pub ambiguity_info: AmbiguityInfo<T>,
}
impl<T:Clone+Default> State<T>
{
pub fn finished(&self)->bool
{
self.position==self.right.len()
}
pub fn next(&self)->usize
{
self.right[self.position]
}
pub fn try_next(&self)->Option<usize>
{
if self.position==self.right.len()
{
None
}
else
{
Some(self.right[self.position])
}
}
pub fn advance(&self)->State<T>
{
State{
rule: self.rule,
left:self.left,
right:self.right.clone(),
position:self.position+1,
original_set: self.original_set,
kind: self.kind.clone(),
values: Vec::default(),
computed_value: None,
ambiguity_info: self.ambiguity_info.clone(),
}
}
}
impl<T:Default> State<T>
{
pub fn new(rule:usize, left:usize, right:Vec<usize>, set:usize, kind:EarleyKind) -> State<T>
{
State{
rule,
left,
right,
position:0,
original_set:set,
kind,
values:Vec::new(),
computed_value: None,
ambiguity_info:AmbiguityInfo::default(),
}
}
}
pub struct StateSet<T>
{
pub states: Vec<State<T>>,
}
impl<T> StateSet<T>
{
pub fn predict(&mut self, mut state: State<T>)
{
for s in self.states.iter()
{
if s.rule==state.rule && s.position==state.position
{
return;
}
}
state.values.clear();
self.states.push(state);
}
}
pub trait ParsingTablesTrait<T>
{
fn initial()->usize;
fn match_some(parser:&mut Parser<T,Self>) -> Option<(usize,T)> where Self:Sized;
fn predict(parser:&mut Parser<T,Self>,index:usize,state_index:usize,token:usize) where Self:Sized;
fn compute_value(state:&mut State<T>);
fn table_terminal(token_index:usize)->bool;
fn table_priority(a:usize, b:usize) -> Option<Ordering>;
fn table_associativity(rule:usize) -> Option<Associativity>;
fn to_usize(token:&T) -> usize;
fn take_token(token:&mut T) -> T where T:Clone+Default,
{
if Self::table_terminal(Self::to_usize(token)){
token.clone()
} else {
std::mem::take(token)
}
}
}
pub struct Parser<'a,T,Tables:ParsingTablesTrait<T>>
{
pub sets: Vec<StateSet<T>>,
pub source: &'a str,
pub source_index: usize,
pub cursor: &'a str,
pub tokens: Vec<T>,
pub tokens_range: Vec<(usize,usize)>,
pub regex_map: HashMap<String,Regex>,
pub verbosity: u8,
pub phantom: PhantomData<Tables>,
}
impl<'a,T:Default+PartialEq+Clone+Debug,Tables:ParsingTablesTrait<T>> Parser<'a,T,Tables>
{
pub fn parse(source:&str, initial:Option<usize>, verbosity:u8) -> Result<T,ParsingError>
{
let mut initial_state=Tables::initial();
if let Some(x)=initial
{
initial_state=x;
}
let n = source.len(); let mut sets = Vec::with_capacity(n+1);
sets.push(
StateSet{
states:vec![ State{
rule:0,
left:0,
right:vec![initial_state],
position:0,
original_set: 0,
kind: EarleyKind::Predict(0),
values: vec![T::default()],
computed_value: None,
ambiguity_info: AmbiguityInfo::default(),
}]
}
);
let mut parser=Parser::<T,Tables>{
sets,
source,
source_index:0,
cursor: source,
tokens: Vec::with_capacity(n),
tokens_range: Vec::with_capacity(n),
regex_map: HashMap::new(),
verbosity,
phantom: PhantomData,
};
if verbosity >= 2 { println!("gramatica: Parser::parse created parser"); }
parser.tokenize()?;
if verbosity>=2 {
println!("gramatica: Parser::parse tokenized");
println!("gramatica: Parser::parse tokenized with {} tokens, filling {} bytes",parser.tokens.len(),parser.tokens.len()*std::mem::size_of::<T>());
}
parser.earley()
}
pub fn re(&mut self, regex:&'a str, source:&str) -> Option<(usize,String)>
{
let s=format!("^({})",regex);
let r=
{
if !self.regex_map.contains_key(&s)
{
let x=Regex::new(&s).expect("gramatica: could not build regex.");
self.regex_map.insert(s.clone(),x);
}
self.regex_map.get(&s).expect("gramatica: regex not in map even after inserting")
};
match r.captures(source)
{
Some(cap) =>
{
let m=cap.get(0).expect("gramatica: regex failed to capture, but said it did.");
Some((m.end(),m.as_str().to_string()))
},
None => None,
}
}
pub fn keyword(&mut self, key:&str, source:&str) -> Option<(usize,String)>
{
let mut key_chars=key.chars();
let mut source_chars=source.chars();
loop
{
match (key_chars.next(),source_chars.next())
{
(None,Some(d)) => match d
{
'a'..='z' | 'A'..='Z' | '_' => return None,
_ => return Some((key.len(),key.to_string())),
},
(None,None) => return Some((key.len(),key.to_string())),
(Some(c),Some(d)) => if c!=d { return None;},
_ => return None,
};
}
}
pub fn tokenize(&mut self) -> Result<(),ParsingError>
{
while self.source_index<self.source.len()
{
if self.verbosity>=3 { println!("gramatica: Parser::tokenize {index}/{total}",index=self.source_index,total=self.source.len()); }
match Tables::match_some(self)
{
None =>
{
let end = self.source_index+100.min(self.source.len());
let s=self.source[self.source_index..end].to_string();
println!("gramatica: Did not match anything '{}'",s);
return Err(ParsingError::BadToken);
},
Some((size,token)) =>
{
if T::default()!=token
{
self.tokens.push(token);
self.tokens_range.push((self.source_index,self.source_index+size));
}
self.source_index+=size;
self.cursor = self.cursor.split_at(size).1;
},
};
}
Ok(())
}
pub fn earley(&mut self) -> Result<T,ParsingError>
{
let n=self.tokens.len();
for index in 0..n+1
{
if self.verbosity>=3{ println!("gramatica: index={index}/{n}"); }
if self.sets[index].states.len()==0
{
println!("gramatica: unexpected token {:?}",self.tokens[index-1]);
let (start,end)=self.tokens_range[index-1];
let showing_start= if start>100 {start-100} else {0};
let showing_end= if end+100>=self.source.len() {self.source.len()} else {start+100};
println!("BEFORE<{}>",self.source[showing_start..start].to_string());
println!("THEN<{}>",self.source[start..end].to_string());
println!("AFTER<{}>",self.source[end..showing_end].to_string());
print!("Tokens=...");
let token_start = if index>20 { index-20 } else {0};
let token_end = if index+20>self.tokens.len() {self.tokens.len()} else { index+20 };
for i in token_start..index-1
{
print!("{:?},",self.tokens[i]);
}
print!("[{:?}],",self.tokens[index-1]);
for i in index..token_end
{
print!("{:?},",self.tokens[i]);
}
println!("");
return Err(ParsingError::NotInGrammar);
}
let last=index==n;
let mut state_index=0; let mut newset=StateSet{states:vec![]};
while state_index < self.sets[index].states.len()
{
let state=self.sets[index].states[state_index].clone();
if state.finished()
{
let token=state.left;
let x=state.original_set;
let mut priority=true; let mut ambiguity=vec![state.clone()];
for i in 0..self.sets[index].states.len()
{
if i==state_index
{
continue;
}
let other=&self.sets[index].states[i];
if token!=other.left || x!=other.original_set || !other.finished()
{
continue;
}
match Tables::table_priority(state.rule,other.rule)
{
None =>
{
ambiguity.push(other.clone());
if i<state_index
{
priority=false;
break;
}
},
Some(Ordering::Less) =>
{
priority=false;
break;
},
Some(Ordering::Equal) =>
{
match Tables::table_associativity(state.rule)
{
None =>
{
ambiguity.push(other.clone());
if i<state_index
{
priority=false;
break;
}
},
Some(Associativity::Left) => if state_index>i
{
priority=false;
break;
},
Some(Associativity::Right) => if state_index<i
{
priority=false;
break;
},
}
},
Some(Ordering::Greater) => continue,
};
}
if priority
{
let mut i=0;
while i<self.sets[x].states.len()
{
if let Some(t)=self.sets[x].states[i].try_next()
{
if token==t
{
let mut new=self.sets[x].states[i].advance();
new.kind=EarleyKind::Complete(state_index,x,i);
self.sets[index].states.push(new);
}
}
i+=1;
}
}
}
else
{
if Tables::table_terminal(state.next())
{
if !last && state.next()==Tables::to_usize(&self.tokens[index])
{
let mut new=state.advance();
new.kind=EarleyKind::Scan(index,state_index);
newset.states.push(new);
}
}
else
{
let token=state.next();
Tables::predict(self,index,state_index,token);
}
}
state_index+=1;
}
self.sets.push(newset);
}
if self.verbosity>=2 {
let total_states=self.sets.iter().map(|set|set.states.len()).sum::<usize>();
println!("gramatica: Total of {} states, filling {} bytes",total_states,total_states*std::mem::size_of::<State<T>>());
}
let mut values=vec![];
for state_index in 0..self.sets[n].states.len()
{
if self.sets[n].states[state_index].left==0 && self.sets[n].states[state_index].finished()
{
self.compute_value(n,state_index);
if self.sets[n].states[state_index].ambiguity_info.states.len()>1
{
if self.verbosity>=1 {
let a=&self.sets[n].states[state_index].ambiguity_info;
println!("gramatica: Ambiguity at index={} token={:?}",a.index,self.tokens[a.index-1]);
let (start,end)=self.tokens_range[a.index-1];
let showing_start= if start>100 {start-100} else {0};
let showing_end= if end+100>=self.source.len() {self.source.len()} else {start+100};
println!("BEFORE<{}>",self.source[showing_start..start].to_string());
println!("THEN<{}>",self.source[start..end].to_string());
println!("AFTER<{}>",self.source[end..showing_end].to_string());
}
return Err(ParsingError::Ambiguous);
}
values.push(self.sets[n].states[state_index].computed_value.clone());
}
}
if self.verbosity>= 2{ println!("gramatica: values={:?}",values); }
if values.len()==0
{
Err(ParsingError::NotInGrammar)
}
else if values.len()==1
{
Ok(values[0].take().unwrap())
}
else
{
Err(ParsingError::Ambiguous)
}
}
pub fn compute_value(&mut self, set_index:usize, state_index:usize)
{
let mut pending = vec![(false, set_index,state_index)];
while pending.len()>0
{
let (expanded, top_set, top_state) = pending.pop().unwrap();
let right_size = self.sets[top_set].states[top_state].right.len();
match self.sets[top_set].states[top_state].kind
{
EarleyKind::Complete(reduced,prev_set_index,prev_state_index) =>
{
if !expanded
{
pending.push( (true,top_set,top_state) );
pending.push( (false,top_set,reduced) );
pending.push( (false,prev_set_index,prev_state_index) );
continue;
}
let mut prop_ambiguity=None;
self.sets[top_set].states[top_state].values=
{
let mut values=std::mem::take(&mut self.sets[prev_set_index].states[prev_state_index].values);
if values.len() < right_size
{
values.resize(right_size,Default::default());
}
values[self.sets[top_set].states[top_state].position-1]=self.sets[top_set].states[reduced].computed_value.take().unwrap();
let prev=&self.sets[prev_set_index].states[prev_state_index];
let red=&self.sets[top_set].states[reduced];
if prev.ambiguity_info.states.len()>1
{
prop_ambiguity=Some(prev.ambiguity_info.clone());
}
else if red.ambiguity_info.states.len()>1
{
prop_ambiguity=Some(red.ambiguity_info.clone());
}
values
};
if let Some(a)=prop_ambiguity
{
self.sets[top_set].states[top_state].ambiguity_info=a;
}
},
EarleyKind::Scan(prev_set_index,prev_state_index) =>
{
if !expanded
{
pending.push( (true,top_set,top_state) );
pending.push( (false,prev_set_index,prev_state_index) );
continue;
}
let position = self.sets[top_set].states[top_state].position;
let mut values = std::mem::take(&mut self.sets[prev_set_index].states[prev_state_index].values);
if values.len() < right_size
{
values.resize(right_size,Default::default());
}
let token = self.tokens[top_set-1].clone();
values[position-1] = token;
self.sets[top_set].states[top_state].values = values;
self.sets[top_set].states[top_state].ambiguity_info=self.sets[prev_set_index].states[prev_state_index].ambiguity_info.clone();
},
EarleyKind::Predict(_harbinger_state_index) =>
{
},
}
let current=&mut self.sets[top_set].states[top_state];
if current.finished()
{
Tables::compute_value(current);
current.values.clear();
}
}
}
pub fn compute_value_recursive(&mut self, set_index:usize, state_index:usize)
{
match self.sets[set_index].states[state_index].kind
{
EarleyKind::Complete(reduced,prev_set_index,prev_state_index) =>
{
self.compute_value_recursive(set_index,reduced);
self.compute_value_recursive(prev_set_index,prev_state_index);
let mut prop_ambiguity=None;
self.sets[set_index].states[state_index].values=
{
let prev=&self.sets[prev_set_index].states[prev_state_index];
let red=&self.sets[set_index].states[reduced];
let mut current_values=prev.values.clone();
current_values[self.sets[set_index].states[state_index].position-1]=red.computed_value.clone().unwrap();
if prev.ambiguity_info.states.len()>1
{
prop_ambiguity=Some(prev.ambiguity_info.clone());
}
else if red.ambiguity_info.states.len()>1
{
prop_ambiguity=Some(red.ambiguity_info.clone());
}
current_values
};
if let Some(a)=prop_ambiguity
{
self.sets[set_index].states[state_index].ambiguity_info=a;
}
},
EarleyKind::Scan(prev_set_index,prev_state_index) =>
{
self.compute_value_recursive(prev_set_index,prev_state_index);
for i in 0..self.sets[set_index].states[state_index].position-1
{
self.sets[set_index].states[state_index].values[i]=self.sets[prev_set_index].states[prev_state_index].values[i].clone();
}
self.sets[set_index].states[state_index].ambiguity_info=self.sets[prev_set_index].states[prev_state_index].ambiguity_info.clone();
},
EarleyKind::Predict(_harbinger_state_index) =>
{
},
}
let current=&mut self.sets[set_index].states[state_index];
if current.finished()
{
Tables::compute_value(current);
}
}
}
#[cfg(test)]
mod tests {
#[test]
fn it_works() {
assert_eq!(2 + 2, 4);
}
}