use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use crate::debug_trace;
use crate::logic::grammar::{Grammar, Production, Segment, Symbol};
use crate::logic::partial::structure::{
PackedAlternative, PartialAST, SppfChild, SppfForest, SppfNodeId, SppfNodeKey, Terminal,
};
use crate::logic::segment::SegmentRange;
use crate::regex::{PrefixStatus, Regex as DerivativeRegex};
/// Deterministic "shuffle" of the production indices `0..n`.
///
/// Despite the name this is a rotation, not a PRNG shuffle: index `i` maps to
/// `(i + level) % n`, so each recursion level starts trying alternatives at a
/// different offset. Returns an empty vector when `n == 0`; no explicit guard
/// is needed because the closure (and its `% n`) never runs for an empty range.
fn prng_shuffle(n: usize, level: usize) -> Vec<usize> {
    (0..n).map(|i| (i + level) % n).collect()
}
/// Default cap on nonterminal nesting depth; override via `Parser::with_max_recursion`.
const DEFAULT_MAX_RECURSION_DEPTH: usize = 15;
/// Result of `Parser::partial`: either a forest of parse roots or a hard failure.
#[derive(Debug, Clone)]
pub enum PartialParseOutcome {
    /// At least one root consumed every input segment; individual subtrees may
    /// still be incomplete (e.g. end in a partial terminal).
    Success { ast: PartialAST },
    /// No root covered the input; see `ParseError` for the reason.
    Failure(ParseError),
}
/// Reasons a parse can fail outright.
#[derive(Debug, Clone, PartialEq)]
pub enum ParseError {
    /// The grammar's tokenizer rejected the input (carries its message).
    /// NOTE(review): `partial` also wraps internal production-lookup errors
    /// in this variant, so the message is not always a lexer error.
    Tokenization(String),
    /// The grammar defines no start nonterminal.
    NoStartSymbol,
    /// No parse alternative consumed the entire input.
    NoValidParse,
    /// The recursion depth limit was hit before any valid parse was found.
    DepthLimit,
}
impl std::fmt::Display for ParseError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ParseError::Tokenization(e) => write!(f, "Tokenization error: {}", e),
ParseError::NoStartSymbol => write!(f, "Grammar has no start symbol"),
ParseError::NoValidParse => write!(f, "No valid parse alternatives found"),
ParseError::DepthLimit => write!(f, "Recursion depth limit reached"),
}
}
}
// `ParseError` has no underlying source error, so the default `Error` methods suffice.
impl std::error::Error for ParseError {}
impl PartialParseOutcome {
    /// True when the parse succeeded and the resulting AST is complete.
    pub fn is_complete(&self) -> bool {
        if let Self::Success { ast } = self {
            ast.is_complete()
        } else {
            false
        }
    }
    /// True for the `Success` variant, regardless of root count.
    pub fn is_success(&self) -> bool {
        matches!(self, Self::Success { .. })
    }
    /// Converts into a `Result`; a `Success` with zero roots maps to
    /// `Err(ParseError::NoValidParse)`.
    pub fn into_result(self) -> Result<PartialAST, ParseError> {
        match self {
            Self::Failure(err) => Err(err),
            Self::Success { ast } => {
                if ast.is_empty() {
                    Err(ParseError::NoValidParse)
                } else {
                    Ok(ast)
                }
            }
        }
    }
    /// Borrows the AST when the outcome is a success.
    pub fn ast(&self) -> Option<&PartialAST> {
        if let Self::Success { ast } = self {
            Some(ast)
        } else {
            None
        }
    }
    /// Returns the AST, panicking on failure or an empty success.
    pub fn unwrap(self) -> PartialAST {
        match self {
            Self::Failure(err) => panic!("Called unwrap on Failure: {}", err),
            Self::Success { ast } => {
                if ast.is_empty() {
                    panic!("Called unwrap on Success with 0 roots");
                }
                ast
            }
        }
    }
    /// Like `unwrap`, but panics with the caller-supplied message.
    pub fn expect(self, msg: &str) -> PartialAST {
        match self {
            Self::Success { ast } if !ast.is_empty() => ast,
            _ => panic!("{}", msg),
        }
    }
    /// Alias for `is_success`, mirroring `Result::is_ok`.
    pub fn is_ok(&self) -> bool {
        self.is_success()
    }
    /// Returns the error, panicking on success.
    pub fn unwrap_err(self) -> ParseError {
        match self {
            Self::Failure(err) => err,
            Self::Success { .. } => panic!("Called unwrap_err on Success"),
        }
    }
}
/// Per-parse bookkeeping threaded through the recursive descent.
struct ParseState {
    /// Re-entry counter per (nonterminal, absolute position); bounds
    /// left/indirect recursion in `parse_nonterminal`.
    visited: HashMap<(String, usize), usize>,
    /// Set when any branch was abandoned because a depth limit fired.
    hit_depth_limit: bool,
    /// Memoized results keyed by (nonterminal, absolute position, remaining segment count).
    memo: HashMap<(String, usize, usize), Vec<ParsedNt>>,
    /// Memo keys currently on the call stack; lookups are skipped for these
    /// to avoid returning half-built results for recursive nonterminals.
    in_progress: HashSet<(String, usize, usize)>,
}
/// One outcome of parsing a nonterminal: an interned SPPF node plus how much
/// input it consumed and whether the subtree matched completely.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct ParsedNt {
    /// Node interned into the shared `SppfForest`.
    node_id: SppfNodeId,
    /// Number of segments consumed from the node's start position.
    consumed: usize,
    /// True when every child of the packed alternative matched completely.
    complete: bool,
}
/// One parsed symbol within a production body: the SPPF child (terminal or
/// node) together with its consumption/completeness accounting.
#[derive(Debug, Clone)]
struct ParsedChild {
    /// Terminal match or reference to a nonterminal node in the forest.
    child: SppfChild,
    /// Segments consumed by this child.
    consumed: usize,
    /// False for partial terminals or incomplete nonterminal subtrees.
    complete: bool,
}
impl ParseState {
fn new() -> Self {
Self {
visited: HashMap::new(),
hit_depth_limit: false,
memo: HashMap::new(),
in_progress: HashSet::new(),
}
}
}
impl Segment {
    /// The single-element `SegmentRange` covering just this segment's index.
    pub fn seg_range(&self) -> SegmentRange {
        SegmentRange::single(self.index)
    }
}
/// Recursive-descent parser over a `Grammar`, producing SPPF-backed partial ASTs.
pub struct Parser {
    pub(crate) grammar: Grammar,
    /// Copy of `grammar.special_tokens`; a reserved token only matches a
    /// terminal whose regex is exactly that literal (see `parse_regex`).
    reserved_tokens: HashSet<String>,
    /// Hard cap on nonterminal nesting depth (`level` in `parse_nonterminal`).
    max_recursion: usize,
    /// Whether the most recent `partial()` run hit a depth limit anywhere.
    last_hit_depth_limit: bool,
}
impl Parser {
pub fn new(grammar: Grammar) -> Self {
let reserved_tokens: HashSet<String> = grammar.special_tokens.iter().cloned().collect();
Self {
grammar,
reserved_tokens,
max_recursion: DEFAULT_MAX_RECURSION_DEPTH,
last_hit_depth_limit: false,
}
}
/// Builder-style override of the recursion depth cap.
pub fn with_max_recursion(mut self, depth: usize) -> Self {
    self.max_recursion = depth;
    self
}
/// In-place override of the recursion depth cap.
pub fn set_max_recursion(&mut self, depth: usize) {
    self.max_recursion = depth;
}
/// Whether the most recent `partial()` call hit a depth limit anywhere.
pub fn last_hit_depth_limit(&self) -> bool {
    self.last_hit_depth_limit
}
/// No-op: memoization is per-call via `ParseState`. Presumably retained for
/// API compatibility with an earlier caching parser — TODO confirm.
pub fn clear_cache(&mut self) {}
/// Full (non-partial) parse: succeeds only when some root both consumes
/// every input segment and is a complete subtree.
///
/// Errors are stringified for the caller; tokenization is re-run here to
/// learn the total segment count.
pub fn parse(&mut self, input: &str) -> Result<PartialAST, String> {
    let outcome = self.partial(input);
    match outcome {
        PartialParseOutcome::Failure(err) => Err(err.to_string()),
        PartialParseOutcome::Success { ast } => {
            let segments = self.tokenize(input).map_err(|e| e.to_string())?;
            let total_segments = segments.len();
            // A usable root must cover the whole input and be complete.
            let has_complete_root = ast.root_ids().iter().any(|root_id| {
                ast.forest().consumed_segments(*root_id) == total_segments
                    && ast.forest().node_is_complete(*root_id)
            });
            if has_complete_root {
                Ok(ast)
            } else {
                Err(format!(
                    "Parse error: no complete parse found consuming all {} tokens",
                    total_segments
                ))
            }
        }
    }
}
pub fn partial(&mut self, input: &str) -> PartialParseOutcome {
self.last_hit_depth_limit = false;
debug_trace!("parser2 ", "Starting parse of input: '{}'", input);
let outcome = (|| {
let segments = match self.tokenize(input) {
Ok(s) => s,
Err(e) => return PartialParseOutcome::Failure(ParseError::Tokenization(e)),
};
debug_trace!("parser2 ", "Tokenized into {:?}", segments);
let start_nt = match self.grammar.start_nonterminal() {
Some(s) => s.to_string(),
None => return PartialParseOutcome::Failure(ParseError::NoStartSymbol),
};
debug_trace!("parser2 ", "Start nonterminal: {}", start_nt);
let mut parse_state = ParseState::new();
let mut forest = SppfForest::new();
let roots = match self.parse_nonterminal(
&segments,
&start_nt,
None,
0,
0,
&mut parse_state,
&mut forest,
) {
Ok(r) => r,
Err(e) => {
return PartialParseOutcome::Failure(ParseError::Tokenization(e));
}
};
let total_segments = segments.len();
let depth_limited = parse_state.hit_depth_limit;
self.last_hit_depth_limit = depth_limited;
let valid_roots: Vec<SppfNodeId> = roots
.into_iter()
.filter(|r| r.consumed == total_segments)
.map(|r| r.node_id)
.collect();
if valid_roots.is_empty() {
if depth_limited {
return PartialParseOutcome::Failure(ParseError::DepthLimit);
}
return PartialParseOutcome::Failure(ParseError::NoValidParse);
}
PartialParseOutcome::Success {
ast: PartialAST::from_forest(forest, valid_roots, input.to_string()),
}
})();
outcome
}
/// Delegates tokenization of `input` to the grammar's lexer.
fn tokenize(&self, input: &str) -> Result<Vec<Segment>, String> {
    self.grammar.tokenize(input)
}
/// Parses nonterminal `nt_name` against a prefix of `segments`.
///
/// `abs_pos` is the absolute index of `segments[0]` within the full input;
/// `level` is the nesting depth, driving both the global depth cap and the
/// rotation of alternative try-order. Every successful alternative is packed
/// into `forest`; the returned `ParsedNt`s are the distinct
/// (node, consumed, complete) outcomes. An empty result means nothing matched
/// here — possibly because a depth limit fired (see `parse_state.hit_depth_limit`).
fn parse_nonterminal(
    &mut self,
    segments: &[Segment],
    nt_name: &str,
    binding: Option<String>,
    abs_pos: usize,
    level: usize,
    parse_state: &mut ParseState,
    forest: &mut SppfForest,
) -> Result<Vec<ParsedNt>, String> {
    let indent = " ".repeat(level);
    debug_trace!(
        "parser2 ",
        "{}[L{}] Parsing nonterminal '{}' at abs_pos {}",
        indent,
        level,
        nt_name,
        abs_pos
    );
    // Hard global cap on nesting depth: abandon the branch and record the cutoff.
    if level > self.max_recursion {
        debug_trace!(
            "parser2 ",
            "{}[L{}] Termination: Global depth limit exceeded (> {})",
            indent,
            level,
            self.max_recursion
        );
        parse_state.hit_depth_limit = true;
        return Ok(Vec::new());
    }
    // `key` counts re-entries on (nonterminal, position) to bound recursion;
    // `memo_key` additionally includes the remaining-input length for caching.
    let key = (nt_name.to_string(), abs_pos);
    let memo_key = (nt_name.to_string(), abs_pos, segments.len());
    // Only trust the memo when this key is not being expanded higher up the
    // stack; a recursive cycle could otherwise observe a half-built entry.
    if !parse_state.in_progress.contains(&memo_key) {
        if let Some(cached) = parse_state.memo.get(&memo_key) {
            return Ok(cached.clone());
        }
    }
    if let Some(count) = parse_state.visited.get(&key) {
        // Re-entry at the same position: allow a bounded number of nested
        // attempts (remaining segments + 8, clamped by max_recursion).
        let local_limit = self.max_recursion.min(segments.len().saturating_add(8));
        debug_trace!(
            "parser2 ",
            "{}[L{}] Recursion detected for '{}' at abs_pos {} depth {}",
            indent,
            level,
            nt_name,
            abs_pos,
            count
        );
        if *count >= local_limit {
            debug_trace!(
                "parser2 ",
                "{}[L{}] Termination: Too much recursion (>= {}, remaining segments: {})",
                indent,
                level,
                local_limit,
                segments.len()
            );
            parse_state.hit_depth_limit = true;
            return Ok(Vec::new());
        } else {
            debug_trace!(
                "parser2 ",
                "{}[L{}] Continuing recursion (count: {})",
                indent,
                level,
                count + 1
            );
            parse_state.visited.insert(key.clone(), count + 1);
        }
    } else {
        debug_trace!(
            "parser2 ",
            "{}[L{}] First parse attempt for '{}' at abs_pos {}",
            indent,
            level,
            nt_name,
            abs_pos
        );
        parse_state.visited.insert(key.clone(), 1);
    }
    parse_state.in_progress.insert(memo_key.clone());
    // NOTE(review): if the `?` below returns Err, `memo_key` is left in
    // `in_progress` (the removal at the end is skipped) — confirm acceptable.
    let productions_len = self
        .grammar
        .productions
        .get(nt_name)
        .ok_or_else(|| format!("No productions for nonterminal '{}'", nt_name))?
        .len();
    // Deterministic rotation of the alternative order, varied by depth.
    let shuffled_indices = prng_shuffle(productions_len, level);
    let mut outcomes = Vec::new();
    // Deduplicates identical (node, consumed, complete) results across alternatives.
    let mut seen = HashSet::new();
    for &alt_idx in &shuffled_indices {
        // Re-fetched (and cloned) per iteration: `parse_production` takes
        // `&mut self`, so a borrow of the production cannot be held across it.
        let prod = self
            .grammar
            .productions
            .get(nt_name)
            .and_then(|ps| ps.get(alt_idx))
            .ok_or_else(|| {
                format!(
                    "No production index {} for nonterminal '{}'",
                    alt_idx, nt_name
                )
            })?
            .clone();
        let prod_ref = Arc::new(prod);
        debug_trace!(
            "parser2 ",
            "{}[L{}] Trying production {}@{}: {} on {}",
            indent,
            level,
            nt_name,
            alt_idx,
            prod_ref,
            segments
                .iter()
                .map(|s| s.text())
                .collect::<Vec<String>>()
                .join(" ")
        );
        match self.parse_production(segments, &prod_ref, abs_pos, level, parse_state, forest) {
            Ok(prod_outcomes) => {
                if prod_outcomes.is_empty() {
                    debug_trace!(
                        "parser2 ",
                        "{}[L{}] Production {}@{} produced no results",
                        indent,
                        level,
                        nt_name,
                        alt_idx
                    );
                    continue;
                } else {
                    debug_trace!(
                        "parser2 ",
                        "{}[L{}] Production {}@{} succeeded with {} parse sequences",
                        indent,
                        level,
                        nt_name,
                        alt_idx,
                        prod_outcomes.len()
                    );
                    for children in prod_outcomes {
                        let consumed = self.count_consumed_segments(&children);
                        let complete =
                            self.children_are_complete(&children, prod_ref.rhs.len());
                        // Shadows the outer recursion `key` tuple; scoped to
                        // this loop body only.
                        let key = SppfNodeKey {
                            name: nt_name.to_string(),
                            binding: binding.clone(),
                            abs_pos,
                            consumed_segments: consumed,
                        };
                        let node_id = forest.intern_node(key);
                        forest.add_alternative(
                            node_id,
                            PackedAlternative {
                                alternative_index: alt_idx,
                                production: Arc::clone(&prod_ref),
                                children: children.into_iter().map(|c| c.child).collect(),
                            },
                        );
                        let out = ParsedNt {
                            node_id,
                            consumed,
                            complete,
                        };
                        if seen.insert(out.clone()) {
                            outcomes.push(out);
                        }
                    }
                }
            }
            Err(e) => {
                debug_trace!(
                    "parser2 ",
                    "{}[L{}] Production {}@{} failed: {}",
                    indent,
                    level,
                    nt_name,
                    alt_idx,
                    e
                );
            }
        }
    }
    // NOTE(review): removes the recursion counter outright rather than
    // decrementing it, so outer frames for the same key restart at 1.
    parse_state.visited.remove(&key);
    parse_state.in_progress.remove(&memo_key);
    debug_trace!(
        "parser2 ",
        "{}[L{}] Finished parsing nonterminal '{}': {} trees",
        indent,
        level,
        nt_name,
        outcomes.len()
    );
    // Depth-limited results are truncated, so they are never cached.
    if !parse_state.hit_depth_limit {
        parse_state.memo.insert(memo_key, outcomes.clone());
    }
    Ok(outcomes)
}
/// Parses one production body at `abs_pos`; an empty right-hand side
/// (epsilon) matches trivially with a single empty child sequence.
fn parse_production(
    &mut self,
    segments: &[Segment],
    prod: &Production,
    abs_pos: usize,
    level: usize,
    parse_state: &mut ParseState,
    forest: &mut SppfForest,
) -> Result<Vec<Vec<ParsedChild>>, String> {
    let indent = " ".repeat(level);
    debug_trace!(
        "parser2.prod ",
        "{}[L{}] Parsing production: {:?}",
        indent,
        level,
        prod
    );
    if !prod.rhs.is_empty() {
        return self.parse_symbols(segments, &prod.rhs, abs_pos, level, parse_state, forest);
    }
    debug_trace!(
        "parser2.prod ",
        "{}[L{}] Epsilon production matched",
        indent,
        level
    );
    Ok(vec![vec![]])
}
/// Parses a symbol sequence, returning every alternative child sequence.
///
/// Recurses on the tail after each way of matching the head. Results for the
/// tail depend only on how many segments the head consumed, so they are
/// cached per consumed-count within this call.
fn parse_symbols(
    &mut self,
    segments: &[Segment],
    symbols: &[Symbol],
    abs_pos: usize,
    level: usize,
    parse_state: &mut ParseState,
    forest: &mut SppfForest,
) -> Result<Vec<Vec<ParsedChild>>, String> {
    // Empty sequence matches trivially: one alternative with no children.
    if symbols.is_empty() {
        return Ok(vec![vec![]]);
    }
    let first_sym = &symbols[0];
    let rest_syms = &symbols[1..];
    let first_parses =
        self.parse_symbol(segments, first_sym, abs_pos, level, parse_state, forest)?;
    if first_parses.is_empty() {
        return Ok(Vec::new());
    }
    let mut outcomes = Vec::with_capacity(first_parses.len());
    // Tail results keyed by segments consumed by the head parse.
    let mut rest_cache: HashMap<usize, Vec<Vec<ParsedChild>>> = HashMap::new();
    for node in first_parses {
        let consumed: usize = node.consumed;
        if !node.complete {
            // An incomplete head can only end a (partial) parse, and only
            // when it reached the end of the available input; the tail
            // symbols are then dropped.
            if consumed == segments.len() {
                outcomes.push(vec![node]);
            }
            continue;
        }
        let remaining_segments = segments.get(consumed..).unwrap_or(&[]);
        let new_abs_pos = abs_pos + consumed;
        let rest_parses = if let Some(cached) = rest_cache.get(&consumed) {
            cached.clone()
        } else {
            let parsed = self.parse_symbols(
                remaining_segments,
                rest_syms,
                new_abs_pos,
                level,
                parse_state,
                forest,
            )?;
            rest_cache.insert(consumed, parsed.clone());
            parsed
        };
        // Prepend the head parse onto every completion of the tail.
        for mut rest_nodes in rest_parses {
            let mut full_parse = vec![node.clone()];
            full_parse.append(&mut rest_nodes);
            outcomes.push(full_parse);
        }
    }
    Ok(outcomes)
}
/// Total segments consumed by a child sequence.
fn count_consumed_segments(&self, nodes: &[ParsedChild]) -> usize {
    nodes.iter().fold(0, |total, child| total + child.consumed)
}
/// A production parse is complete when all `rhs_len` symbols were matched
/// and every child is itself complete.
fn children_are_complete(&self, nodes: &[ParsedChild], rhs_len: usize) -> bool {
    if nodes.len() != rhs_len {
        return false;
    }
    nodes.iter().all(|child| child.complete)
}
/// Parses a single grammar symbol: terminals go through the derivative-regex
/// matcher, nonterminals recurse (one level deeper) and wrap each resulting
/// forest node as a `ParsedChild`.
///
/// Fix: returns the `match` expression directly instead of binding it to a
/// temporary and returning that (`let res = …; res` anti-pattern).
fn parse_symbol(
    &mut self,
    segments: &[Segment],
    symbol: &Symbol,
    abs_pos: usize,
    level: usize,
    parse_state: &mut ParseState,
    forest: &mut SppfForest,
) -> Result<Vec<ParsedChild>, String> {
    match symbol {
        Symbol::Terminal { regex, binding } => {
            self.parse_regex(segments, regex, binding.clone(), level)
        }
        Symbol::Nonterminal { name, binding } => {
            let nts = self.parse_nonterminal(
                segments,
                name,
                binding.clone(),
                abs_pos,
                level + 1,
                parse_state,
                forest,
            )?;
            Ok(nts
                .into_iter()
                .map(|nt| ParsedChild {
                    child: SppfChild::Node(nt.node_id),
                    consumed: nt.consumed,
                    complete: nt.complete,
                })
                .collect())
        }
    }
}
/// Matches the derivative regex `re` against at most the first segment.
///
/// At end of input, produces an empty partial terminal carrying the whole
/// regex as its remainder. Otherwise `prefix_match` classifies the segment
/// as a complete, partial (proper prefix), extensible (complete but could
/// grow), or failed match. Reserved tokens only match a terminal whose
/// regex is exactly that literal.
fn parse_regex(
    &self,
    segments: &[Segment],
    re: &DerivativeRegex,
    binding: Option<String>,
    level: usize,
) -> Result<Vec<ParsedChild>, String> {
    let indent = " ".repeat(level);
    if segments.is_empty() {
        debug_trace!(
            "parser2.regex",
            "{}[L{}] At end of input, returning partial terminal",
            indent,
            level
        );
        return Ok(vec![ParsedChild {
            child: SppfChild::Terminal(Terminal::Partial {
                value: String::new(),
                binding,
                remainder: Some(re.clone()),
            }),
            consumed: 0,
            complete: false,
        }]);
    }
    let seg = &segments[0];
    let seg_text = seg.as_str();
    // A reserved token never matches a generic regex, only its exact literal.
    if self.reserved_tokens.contains(seg_text)
        && !re.equiv(&DerivativeRegex::literal(seg_text))
    {
        return Ok(Vec::new());
    }
    debug_trace!(
        "parser2.regex",
        "{}[L{}] Trying regex '{}' against segment '{}'",
        indent,
        level,
        re.to_pattern(),
        seg_text
    );
    let children = match re.prefix_match(seg_text) {
        PrefixStatus::Complete => {
            debug_trace!(
                "parser2.regex",
                "{}[L{}] Regex FULL match for segment '{}'",
                indent,
                level,
                seg_text
            );
            vec![ParsedChild {
                child: SppfChild::Terminal(Terminal::Complete {
                    value: seg_text.to_string(),
                    binding,
                    extension: None,
                }),
                consumed: 1,
                complete: true,
            }]
        }
        PrefixStatus::Prefix(derivative) => {
            debug_trace!(
                "parser2.regex",
                "{}[L{}] Regex PARTIAL match for segment '{}'",
                indent,
                level,
                seg_text
            );
            vec![ParsedChild {
                child: SppfChild::Terminal(Terminal::Partial {
                    value: seg_text.to_string(),
                    binding,
                    remainder: Some(derivative),
                }),
                consumed: 1,
                complete: false,
            }]
        }
        PrefixStatus::Extensible(derivative) => {
            debug_trace!(
                "parser2.regex",
                "{}[L{}] Regex EXTENSIBLE match for segment '{}'",
                indent,
                level,
                seg_text
            );
            vec![ParsedChild {
                child: SppfChild::Terminal(Terminal::Complete {
                    value: seg_text.to_string(),
                    binding,
                    extension: Some(derivative),
                }),
                consumed: 1,
                complete: true,
            }]
        }
        PrefixStatus::NoMatch => {
            debug_trace!(
                "parser2.regex",
                "{}[L{}] Regex NO match for segment '{}'",
                indent,
                level,
                seg_text
            );
            Vec::new()
        }
    };
    Ok(children)
}
}