//! Parser types for the LR parser.
//! The parser types are used during the parsing process.
//! Some are nearly duplicates of the ones in the `parol` crate which in turn duplicates the `lalr`
//! crate's types.
//! This is suboptimal but necessary to avoid a dependency to the `lalr` crate here.
use core::str;
use std::{cell::RefCell, collections::BTreeSet, convert::TryInto, rc::Rc};
use log::trace;
use crate::{
parser::parser_types::TreeBuilder, FileSource, LRParseTree, NonTerminalIndex, ParolError,
ParseTree, ParseTreeStack, ParseTreeType, ParserError, ProductionIndex, Result, SyntaxError,
TerminalIndex, TokenStream, TokenVec, UnexpectedToken, UserActionsTrait,
};
/// The type of the index of a LR action in the parse table's actions array.
/// Values of this type index into `LRParseTable::actions` and are stored in
/// the per-state `LR1State::actions` entries.
pub type LRActionIndex = usize;
///
/// The type that contains all data to process a production within the lr-parser.
///
#[derive(Debug, Clone)]
pub struct LRProduction {
    ///
    /// The non-terminal index of the symbol on the left-hand side of the
    /// production.
    /// It is used as an index into the generated non-terminal names array and
    /// to select the goto entry after a reduction.
    ///
    pub lhs: NonTerminalIndex,
    ///
    /// The length of the right-hand side of the production, i.e. the number of
    /// grammar symbols popped from the stacks when reducing by this production.
    ///
    pub len: usize,
}
/// An item in the LR(0) state machine.
/// Duplicate of the `lalr` crate's `Item` type without the reference to the creating grammar.
#[derive(Debug, Ord, PartialOrd, Eq, PartialEq)]
pub struct Item {
    /// The production of the item.
    pub prod: ProductionIndex,
    /// The position in the production (the position of the parsing "dot").
    pub pos: usize,
}
/// An item set in the LR(0) state machine.
/// Duplicate of the `lalr` crate's `ItemSet` type without the reference to the creating grammar.
#[derive(Debug)]
pub struct ItemSet {
    /// The items in the set, kept sorted and deduplicated by the `BTreeSet`.
    pub items: BTreeSet<Item>,
}
/// A LALR(1) parse table action.
/// The action can be either a shift, reduce, or accept action.
/// Duplicate of the `lalr` crate's `LRAction` type without the reference to the creating grammar.
#[derive(Debug, PartialEq, Eq)]
pub enum LRAction {
    /// Shift the current token and go to the given next state.
    Shift(usize),
    /// Reduce by the given production; the non-terminal index is the
    /// production's left-hand side and selects the goto entry afterwards.
    Reduce(NonTerminalIndex, ProductionIndex),
    /// Accept the input.
    Accept,
}
/// A state in the LALR(1) parse table.
/// Duplicate of the `lalr` crate's `LR1State` type without the reference to the creating grammar.
#[derive(Debug)]
pub struct LR1State {
    /// The actions to take for each terminal in the state.
    pub actions: &'static [(TerminalIndex, LRActionIndex)],
    /// The gotos to take for each non-terminal in the state.
    pub gotos: &'static [(NonTerminalIndex, usize)],
}

impl LR1State {
    /// Looks up the action index registered for `terminal_index` in this state.
    /// Returns `None` when the state has no entry for that terminal.
    pub fn action_index(&self, terminal_index: TerminalIndex) -> Option<LRActionIndex> {
        for &(terminal, action) in self.actions {
            if terminal == terminal_index {
                return Some(action);
            }
        }
        None
    }

    /// Looks up the goto target state registered for `non_terminal_index`.
    /// Returns `None` when the state has no entry for that non-terminal.
    pub fn goto_state(&self, non_terminal_index: NonTerminalIndex) -> Option<usize> {
        for &(non_terminal, target) in self.gotos {
            if non_terminal == non_terminal_index {
                return Some(target);
            }
        }
        None
    }

    /// Collects every terminal index for which this state defines an action.
    /// The list is displayed in case of an error.
    pub fn viable_terminal_indices(&self) -> Vec<TerminalIndex> {
        let mut terminals = Vec::with_capacity(self.actions.len());
        for &(terminal, _) in self.actions {
            terminals.push(terminal);
        }
        terminals
    }
}
/// The LALR(1) parse table.
#[derive(Debug)]
pub struct LRParseTable {
    /// The actions used in the parse table.
    pub actions: &'static [LRAction],
    /// The states in the parse table.
    pub states: &'static [LR1State],
}

impl LRParseTable {
    /// Resolves the action for the given state and terminal index.
    /// Returns `None` when the state defines no action for this terminal.
    /// Panics if `state` is out of range (direct slice indexing).
    pub fn action(&self, state: usize, terminal_index: TerminalIndex) -> Option<&LRAction> {
        let current = &self.states[state];
        match current.action_index(terminal_index) {
            Some(index) => Some(&self.actions[index]),
            None => None,
        }
    }

    /// Resolves the goto target for the given state and non-terminal index.
    /// Returns `None` when the state defines no goto for this non-terminal.
    pub fn goto(&self, state: usize, non_terminal_index: NonTerminalIndex) -> Option<usize> {
        let current = &self.states[state];
        current.goto_state(non_terminal_index)
    }

    /// Lists all terminal indices that have an action in `state`.
    /// The list is displayed in case of an error.
    pub fn viable_terminal_indices(&self, state: usize) -> Vec<TerminalIndex> {
        let current = &self.states[state];
        current.viable_terminal_indices()
    }
}
/// The LR parse stack.
#[derive(Debug, Default)]
pub struct LRParseStack {
    /// The stack of state indices into the parse table.
    pub stack: Vec<usize>,
}

impl LRParseStack {
    /// Creates a new instance whose stack holds only the start state (index 0).
    pub fn new() -> Self {
        let mut stack = Vec::new();
        stack.push(0);
        Self { stack }
    }

    /// Returns the state on top of the stack.
    /// Callers uphold the invariant that the stack is never empty.
    pub fn current_state(&self) -> usize {
        self.stack.last().copied().unwrap()
    }

    /// Pushes a new state onto the stack.
    pub fn push(&mut self, state: usize) {
        self.stack.push(state);
    }

    /// Pops the top state from the stack.
    /// Callers uphold the invariant that the stack stays non-empty and that
    /// the start state is never removed.
    pub fn pop(&mut self) {
        let _ = self.stack.pop();
    }
}
///
/// The LR parser.
/// It implements a LALR(1) parsing strategy.
/// All data of the generated parser are provided in the 'new' function.
///
/// The lifetime parameter `'t` refers to the lifetime of the scanned text.
///
#[derive(Debug)]
pub struct LRParser<'t> {
    ///
    /// The non-terminal index of the start symbol
    ///
    start_symbol_index: NonTerminalIndex,
    /// The parse table.
    pub parse_table: &'static LRParseTable,
    /// Temporary stack that receives recognized grammar symbols before they
    /// are added to the parse tree.
    ///
    /// This stack is also used to provide arguments to semantic user actions.
    parse_tree_stack: ParseTreeStack<LRParseTree<'t>>,
    /// The stack of the parser, holding parse-table state indices.
    parser_stack: LRParseStack,
    ///
    /// The array of generated grammar productions.
    ///
    productions: &'static [LRProduction],
    ///
    /// Array of generated terminal names.
    ///
    terminal_names: &'static [&'static str],
    ///
    /// Array of generated non-terminal names.
    ///
    non_terminal_names: &'static [&'static str],
    /// Enables trimming of the parse tree during parsing.
    /// Thus the parse tree doesn't grow much and runtime overhead is diminished.
    /// Useful when enabling production mode and the whole parse tree is not needed.
    ///
    /// To enable this call the method `trim_parse_tree` on the parser object before parsing.
    ///
    /// Default is `false`.
    trim_parse_tree: bool,
}
impl<'t> LRParser<'t> {
///
/// Creates a new LR parser.
///
pub fn new(
start_symbol_index: NonTerminalIndex,
parse_table: &'static LRParseTable,
productions: &'static [LRProduction],
terminal_names: &'static [&'static str],
non_terminal_names: &'static [&'static str],
) -> Self {
LRParser {
start_symbol_index,
parse_table,
parse_tree_stack: ParseTreeStack::new(),
parser_stack: LRParseStack::new(),
productions,
terminal_names,
non_terminal_names,
trim_parse_tree: false,
}
}
///
/// Trims the parse tree during parsing.
/// Thus the parse tree doesn't grow much and runtime overhead is diminished.
/// Useful when enabling production mode and the whole parse tree is not needed.
///
pub fn trim_parse_tree(&mut self) {
self.trim_parse_tree = true;
}
fn call_action<'u>(
&mut self,
prod_num: ProductionIndex,
user_actions: &'u mut dyn UserActionsTrait<'t>,
) -> Result<usize> {
// Calculate the number of symbols in the production
let n = self.productions[prod_num].len;
// We remove the last n entries from the parse tree stack
let children: Vec<LRParseTree<'_>> = self
.parse_tree_stack
.split_off(self.parse_tree_stack.len() - n);
// Prepare the arguments for the user's semantic action
let arguments = children
.iter()
.map(|pt| pt.into())
.collect::<Vec<ParseTreeType<'t>>>();
// Insert children under the new non-terminal node of the production being reduced
let non_terminal = LRParseTree::NonTerminal(
self.non_terminal_names[self.productions[prod_num].lhs],
if self.trim_parse_tree {
None
} else {
Some(children)
},
);
// Push the new non-terminal node onto the parse tree stack
self.parse_tree_stack.push(non_terminal);
// With the argument built from children we can call the user's semantic action
trace!("Call semantic action for production {}", prod_num);
user_actions.call_semantic_action_for_production_number(prod_num, &arguments)?;
Ok(n)
}
fn handle_comments<'u>(
&mut self,
stream: &RefCell<TokenStream<'t>>,
user_actions: &'u mut dyn UserActionsTrait<'t>,
) -> Result<()> {
stream
.borrow_mut()
.drain_comments()
.into_iter()
.for_each(|c| user_actions.on_comment_parsed(c));
Ok(())
}
///
/// Parses the input text.
///
pub fn parse<'u>(
&mut self,
stream: TokenStream<'t>,
user_actions: &'u mut dyn UserActionsTrait<'t>,
) -> Result<ParseTree<'t>> {
let stream = Rc::new(RefCell::new(stream));
// Initialize the parse stack and the parse tree stack.
self.parser_stack = LRParseStack::new();
loop {
self.handle_comments(&stream, user_actions)?;
let terminal_index = stream.borrow_mut().lookahead_token_type(0)?;
let current_state = self.parser_stack.current_state();
trace!(
"Current state: {}, token type: {} ({})",
current_state,
terminal_index,
self.terminal_names[terminal_index as usize]
);
// Get the action for the current state and the current terminal
let action = self.parse_table.action(current_state, terminal_index);
match action {
Some(action) => {
match action {
LRAction::Shift(next_state) => {
// Consume the token
let token = stream.borrow_mut().consume()?;
trace!("Shift to state {}", next_state);
self.parser_stack.push(*next_state);
trace!(
"Push token {} ({})",
token.text,
self.terminal_names[token.token_type as usize]
);
let token = LRParseTree::Terminal(token.clone());
self.parse_tree_stack.push(token);
}
LRAction::Reduce(nt_index, prod_index) => {
trace!("Reduce by production {}", prod_index);
let nt_index = *nt_index;
let n = self.call_action(*prod_index, user_actions)?;
for _ in 0..n {
// Pop n states from the stack
if self.parser_stack.stack.is_empty() {
return Err(ParserError::InternalError(
"Attempted to pop from an empty stack".to_owned(),
)
.into());
}
self.parser_stack.pop();
}
// The new state is the one on top of the stack
let state = self.parser_stack.current_state();
trace!("Current state after removing {} states is {}", n, state);
let goto = match self.parse_table.goto(state, nt_index) {
Some(goto) => goto,
None => {
return Err(ParserError::InternalError(format!(
"No goto for non-terminal '{}' in state {}",
nt_index, state
))
.into());
}
};
// Push the new state onto the stack
trace!("Push goto state {}", goto);
self.parser_stack.push(goto);
}
LRAction::Accept => {
trace!("Accept");
// The non-terminal of the start symbol lies on top of the stack here
trace!("Final parse stack: {:?}", self.parser_stack.stack);
trace!("Final parse tree stack:\n{}", self.parse_tree_stack);
// Find the production number of the start symbol
let prod_index = if let Some(index) = self
.productions
.iter()
.position(|p| p.lhs == self.start_symbol_index)
{
index
} else {
return Err(ParserError::InternalError(format!(
"No production found for start symbol '{}'",
self.non_terminal_names[self.start_symbol_index]
))
.into());
};
// Call the action for the start symbol
let _n = self.call_action(prod_index, user_actions)?;
break;
}
}
}
None => {
self.handle_parse_error(&stream, current_state, terminal_index)?;
}
}
}
let parse_tree = if self.trim_parse_tree {
// Return an empty parse tree
TreeBuilder::new().build()
} else {
// The parse tree stack should contain only one element at this point
debug_assert!(self.parse_tree_stack.len() == 1);
let parse_tree = self.parse_tree_stack.pop().unwrap();
parse_tree.try_into()
};
Ok(parse_tree.map_err(|source| ParserError::TreeError { source })?)
}
fn handle_parse_error(
&mut self,
stream: &Rc<RefCell<TokenStream<'t>>>,
current_state: usize,
terminal_index: u16,
) -> Result<()> {
let token = stream.borrow_mut().lookahead(0)?;
trace!("No action for token '{}' in state {}", token, current_state);
trace!("Current scanner is '{}'", stream.borrow().current_scanner());
trace!("Parse stack: {:?}", self.parser_stack.stack);
trace!("Parse tree stack:\n{}", self.parse_tree_stack);
let entries = vec![SyntaxError {
cause: format!(
"No action for token '{}' in state {}\nCurrent scanner is '{}'",
self.terminal_names[terminal_index as usize],
current_state,
stream.borrow().current_scanner()
),
input: Some(Box::new(FileSource::from_stream(&stream.borrow()))),
error_location: Box::new((&token).into()),
unexpected_tokens: vec![UnexpectedToken::new(
"LA(1)".to_owned(),
self.terminal_names[terminal_index as usize].to_owned(),
&token,
)],
expected_tokens: self
.parse_table
.viable_terminal_indices(current_state)
.iter()
.fold(TokenVec::new(), |mut acc, t| {
acc.push(self.terminal_names[*t as usize].to_owned());
acc
}),
source: None,
}];
Err(ParolError::ParserError(ParserError::SyntaxErrors {
entries,
}))
}
}