use std::{
collections::{hash_map::Entry, HashMap, VecDeque},
mem,
};
use log::debug;
use super::{coincident_tokens::CoincidentTokenIndex, token_conflicts::TokenConflictMap};
use crate::{
dedup::split_state_id_groups,
grammars::{LexicalGrammar, SyntaxGrammar},
nfa::{CharacterSet, NfaCursor},
rules::{Symbol, TokenSet},
tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable},
};
/// Character sets containing more than this many distinct ranges are
/// collected into `LexTables::large_character_sets` so that code generation
/// can share them instead of inlining them at every use site.
pub const LARGE_CHARACTER_RANGE_COUNT: usize = 8;

/// The output of `build_lex_table`.
pub struct LexTables {
    /// The lex table used by ordinary parse states.
    pub main_lex_table: LexTable,
    /// The lex table used to recognize keywords via the grammar's word token.
    /// Empty (default) when the grammar has no word token.
    pub keyword_lex_table: LexTable,
    /// Deduplicated character sets with more than `LARGE_CHARACTER_RANGE_COUNT`
    /// ranges. The symbol is `Some` when the set is the union of a token's
    /// main-token advance characters for one lex state, `None` for an
    /// individual (separator) transition's character set.
    pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
}
/// Builds the main lex table, the keyword lex table, and the list of large
/// character sets for the given grammars, assigning a lex state to every
/// parse state in `parse_table`.
///
/// Parse states whose token sets can be combined without introducing lexical
/// conflicts (see `merge_token_set`) share a single lex state. The resulting
/// main table is then minimized and sorted into a canonical order.
pub fn build_lex_table(
    parse_table: &mut ParseTable,
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    keywords: &TokenSet,
    coincident_token_index: &CoincidentTokenIndex,
    token_conflict_map: &TokenConflictMap,
) -> LexTables {
    // If the grammar has a word token, build a dedicated lex table that
    // recognizes only the keywords; otherwise leave it empty.
    let keyword_lex_table = if syntax_grammar.word_token.is_some() {
        let mut builder = LexTableBuilder::new(lexical_grammar);
        builder.add_state_for_tokens(keywords);
        builder.table
    } else {
        LexTable::default()
    };

    // Group parse states by the token sets they need to lex, merging sets
    // whenever it is safe to do so.
    let mut parse_state_ids_by_token_set = Vec::<(TokenSet, Vec<ParseStateId>)>::new();
    for (i, state) in parse_table.states.iter().enumerate() {
        // Compute the tokens valid in this parse state: terminal entries plus
        // reserved words, with keywords replaced by the word token (keywords
        // are recognized via the keyword lex table, not directly).
        let tokens = state
            .terminal_entries
            .keys()
            .copied()
            .chain(state.reserved_words.iter())
            .filter_map(|token| {
                if token.is_terminal() {
                    if keywords.contains(&token) {
                        syntax_grammar.word_token
                    } else {
                        Some(token)
                    }
                } else if token.is_eof() {
                    Some(token)
                } else {
                    None
                }
            })
            .collect();

        // Try to fold this state's token set into an existing group; the
        // first group that accepts the merge wins.
        let mut did_merge = false;
        for entry in &mut parse_state_ids_by_token_set {
            if merge_token_set(
                &mut entry.0,
                &tokens,
                lexical_grammar,
                token_conflict_map,
                coincident_token_index,
            ) {
                did_merge = true;
                entry.1.push(i);
                break;
            }
        }
        if !did_merge {
            parse_state_ids_by_token_set.push((tokens, vec![i]));
        }
    }

    // Build one lex entry state per token-set group, and point every parse
    // state in the group at it.
    let mut builder = LexTableBuilder::new(lexical_grammar);
    for (tokens, parse_state_ids) in parse_state_ids_by_token_set {
        let lex_state_id = builder.add_state_for_tokens(&tokens);
        for id in parse_state_ids {
            parse_table.states[id].lex_state_id = lex_state_id;
        }
    }

    let mut main_lex_table = mem::take(&mut builder.table);
    minimize_lex_table(&mut main_lex_table, parse_table);
    sort_states(&mut main_lex_table, parse_table);

    // Collect large character sets so that code generation can share them.
    // For each token, build a single-token lex table and inspect its states:
    // separator transitions contribute their own sets (tagged `None`), while
    // the union of a state's main-token transition characters is tagged with
    // the token's symbol.
    let mut large_character_sets = Vec::new();
    for (variable_ix, _variable) in lexical_grammar.variables.iter().enumerate() {
        let symbol = Symbol::terminal(variable_ix);
        builder.reset();
        builder.add_state_for_tokens(&TokenSet::from_iter([symbol]));
        for state in &builder.table.states {
            let mut characters = CharacterSet::empty();
            for (chars, action) in &state.advance_actions {
                if action.in_main_token {
                    characters = characters.add(chars);
                    continue;
                }
                // Deduplicate: only record a separator set once.
                if chars.range_count() > LARGE_CHARACTER_RANGE_COUNT
                    && !large_character_sets.iter().any(|(_, set)| set == chars)
                {
                    large_character_sets.push((None, chars.clone()));
                }
            }
            if characters.range_count() > LARGE_CHARACTER_RANGE_COUNT
                && !large_character_sets
                    .iter()
                    .any(|(_, set)| *set == characters)
            {
                large_character_sets.push((Some(symbol), characters));
            }
        }
    }

    LexTables {
        main_lex_table,
        keyword_lex_table,
        large_character_sets,
    }
}
/// A lex state that has been allocated but whose actions have not yet been
/// computed; queued by `LexTableBuilder::add_state`.
struct QueueEntry {
    // Id of the lex state within the table being built.
    state_id: usize,
    // The set of NFA states that this lex state represents.
    nfa_states: Vec<u32>,
    // Whether end-of-file is a valid token in this state.
    eof_valid: bool,
}
/// Builds a `LexTable` by performing a subset construction over the lexical
/// grammar's NFA, interning each distinct set of NFA states as one lex state.
struct LexTableBuilder<'a> {
    lexical_grammar: &'a LexicalGrammar,
    // Cursor used to walk the lexical grammar's NFA.
    cursor: NfaCursor<'a>,
    // The table under construction.
    table: LexTable,
    // States created but not yet populated with actions.
    state_queue: VecDeque<QueueEntry>,
    // Interning map: (sorted NFA state-id set, eof_valid) -> lex state id.
    state_ids_by_nfa_state_set: HashMap<(Vec<u32>, bool), usize>,
}
impl<'a> LexTableBuilder<'a> {
    fn new(lexical_grammar: &'a LexicalGrammar) -> Self {
        Self {
            lexical_grammar,
            cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]),
            table: LexTable::default(),
            state_queue: VecDeque::new(),
            state_ids_by_nfa_state_set: HashMap::new(),
        }
    }

    /// Discards the current table and all interning state so the builder can
    /// be reused for another token set.
    fn reset(&mut self) {
        self.table = LexTable::default();
        self.state_queue.clear();
        self.state_ids_by_nfa_state_set.clear();
    }

    /// Adds (or reuses) an entry-point lex state that can recognize all the
    /// given tokens, then drains the work queue, populating every lex state
    /// reachable from it. Returns the entry-point state's id.
    fn add_state_for_tokens(&mut self, tokens: &TokenSet) -> usize {
        // Collect the NFA start states of all terminal tokens. A
        // non-terminal entry in the set marks end-of-file as valid here.
        let mut eof_valid = false;
        let nfa_states = tokens
            .iter()
            .filter_map(|token| {
                if token.is_terminal() {
                    Some(self.lexical_grammar.variables[token.index].start_state)
                } else {
                    eof_valid = true;
                    None
                }
            })
            .collect();
        let (state_id, is_new) = self.add_state(nfa_states, eof_valid);

        if is_new {
            debug!(
                "entry point state: {state_id}, tokens: {:?}",
                tokens
                    .iter()
                    .map(|t| &self.lexical_grammar.variables[t.index].name)
                    .collect::<Vec<_>>()
            );
        }

        // Process newly-created states (including ones created while
        // populating) until the closure is complete.
        while let Some(QueueEntry {
            state_id,
            nfa_states,
            eof_valid,
        }) = self.state_queue.pop_front()
        {
            self.populate_state(state_id, nfa_states, eof_valid);
        }
        state_id
    }

    /// Interns a lex state for the given NFA state set (after epsilon-closure
    /// via the cursor). Returns the state id and whether it was newly created;
    /// new states are queued for population.
    fn add_state(&mut self, nfa_states: Vec<u32>, eof_valid: bool) -> (usize, bool) {
        self.cursor.reset(nfa_states);
        match self
            .state_ids_by_nfa_state_set
            .entry((self.cursor.state_ids.clone(), eof_valid))
        {
            Entry::Occupied(o) => (*o.get(), false),
            Entry::Vacant(v) => {
                let state_id = self.table.states.len();
                self.table.states.push(LexState::default());
                self.state_queue.push_back(QueueEntry {
                    state_id,
                    nfa_states: v.key().0.clone(),
                    eof_valid,
                });
                v.insert(state_id);
                (state_id, true)
            }
        }
    }

    /// Computes the accept, advance, and EOF actions for one lex state.
    fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>, eof_valid: bool) {
        self.cursor.force_reset(nfa_states);

        // Pick the winning completed token for this state, resolving ties
        // with `TokenConflictMap::prefer_token`.
        let mut completion = None;
        for (id, prec) in self.cursor.completions() {
            if let Some((prev_id, prev_precedence)) = completion {
                if TokenConflictMap::prefer_token(
                    self.lexical_grammar,
                    (prev_precedence, prev_id),
                    (prec, id),
                ) {
                    continue;
                }
            }
            completion = Some((id, prec));
        }

        let transitions = self.cursor.transitions();
        let has_sep = self.cursor.transition_chars().any(|(_, sep)| sep);

        // If EOF is valid here, advance to a fresh state with no NFA states,
        // which will accept the end symbol.
        if eof_valid {
            let (next_state_id, _) = self.add_state(Vec::new(), false);
            self.table.states[state_id].eof_action = Some(AdvanceAction {
                state: next_state_id,
                in_main_token: true,
            });
        }

        // Emit an advance action for each transition, unless the transition
        // is shadowed by the completed token.
        for transition in transitions {
            if let Some((completed_id, completed_precedence)) = completion {
                if !TokenConflictMap::prefer_transition(
                    self.lexical_grammar,
                    &transition,
                    completed_id,
                    completed_precedence,
                    has_sep,
                ) {
                    continue;
                }
            }
            // EOF stays valid only while still consuming separators.
            let (next_state_id, _) =
                self.add_state(transition.states, eof_valid && transition.is_separator);
            self.table.states[state_id].advance_actions.push((
                transition.characters,
                AdvanceAction {
                    state: next_state_id,
                    in_main_token: !transition.is_separator,
                },
            ));
        }

        if let Some((complete_id, _)) = completion {
            self.table.states[state_id].accept_action = Some(Symbol::terminal(complete_id));
        } else if self.cursor.state_ids.is_empty() {
            // A state with no live NFA states is the dedicated EOF state.
            self.table.states[state_id].accept_action = Some(Symbol::end());
        }
    }
}
/// Attempts to merge `other` into `tokens`, returning `true` on success.
///
/// The merge is rejected when any token present in exactly one of the two
/// sets would conflict with, be a matched prefix of, or (for tokens that
/// never appear together in a parse state) overlap with a token from the
/// other set. On success, `tokens` becomes the union of both sets.
fn merge_token_set(
    tokens: &mut TokenSet,
    other: &TokenSet,
    lexical_grammar: &LexicalGrammar,
    token_conflict_map: &TokenConflictMap,
    coincident_token_index: &CoincidentTokenIndex,
) -> bool {
    for ix in 0..lexical_grammar.variables.len() {
        let candidate = Symbol::terminal(ix);
        let in_tokens = tokens.contains_terminal(ix);
        let in_other = other.contains_terminal(ix);

        // Only tokens that appear in exactly one of the two sets can create
        // a new conflict; check them against the set they are absent from.
        let set_lacking_candidate = if in_tokens && !in_other {
            other
        } else if in_other && !in_tokens {
            tokens
        } else {
            continue;
        };

        for peer in set_lacking_candidate.terminals() {
            // Direct lexical conflict or prefix-matching makes the merge unsafe.
            let conflicts = token_conflict_map.does_conflict(ix, peer.index)
                || token_conflict_map.does_match_prefix(ix, peer.index);
            if conflicts {
                return false;
            }
            // Tokens that never co-occur in a parse state must not overlap.
            if !coincident_token_index.contains(candidate, peer)
                && (token_conflict_map.does_overlap(peer.index, ix)
                    || token_conflict_map.does_overlap(ix, peer.index))
            {
                return false;
            }
        }
    }

    tokens.insert_all(other);
    true
}
/// Merges equivalent lex states to shrink the table, rewriting every state
/// reference (advance actions, EOF actions, and the parse table's
/// `lex_state_id`s) to use the new, smaller set of state ids.
///
/// Classic partition refinement: states are first grouped by a cheap
/// signature (everything about a state except the *targets* of its advance
/// actions), then groups are split until no two members of a group transition
/// into different groups. The group containing state 0 is pinned to index 0.
fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
    // Initial grouping by signature. The `i == 0` component keeps state 0
    // from being merged into another group.
    let mut state_ids_by_signature = HashMap::new();
    for (i, state) in table.states.iter().enumerate() {
        let signature = (
            i == 0,
            state.accept_action,
            state.eof_action.is_some(),
            state
                .advance_actions
                .iter()
                .map(|(characters, action)| (characters.clone(), action.in_main_token))
                .collect::<Vec<_>>(),
        );
        state_ids_by_signature
            .entry(signature)
            // Lazy form: don't construct the Vec unless the entry is vacant.
            .or_insert_with(Vec::new)
            .push(i);
    }

    // Order the groups deterministically (HashMap iteration order is not),
    // then move the group containing state 0 to the front. The groups are
    // disjoint, non-empty id sets, so an unstable sort is equivalent.
    let mut state_ids_by_group_id = state_ids_by_signature
        .into_iter()
        .map(|e| e.1)
        .collect::<Vec<_>>();
    state_ids_by_group_id.sort_unstable();
    let error_group_index = state_ids_by_group_id
        .iter()
        .position(|g| g.contains(&0))
        .unwrap();
    state_ids_by_group_id.swap(error_group_index, 0);

    // Inverse mapping: state id -> group id.
    let mut group_ids_by_state_id = vec![0; table.states.len()];
    for (group_id, state_ids) in state_ids_by_group_id.iter().enumerate() {
        for state_id in state_ids {
            group_ids_by_state_id[*state_id] = group_id;
        }
    }

    // Refine the partition to a fixed point: split any group whose members
    // advance into different groups.
    while split_state_id_groups(
        &table.states,
        &mut state_ids_by_group_id,
        &mut group_ids_by_state_id,
        1,
        lex_states_differ,
    ) {}

    // Keep one representative state per group, remapping its action targets
    // from old state ids to group ids.
    let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
    for state_ids in &state_ids_by_group_id {
        let mut new_state = mem::take(&mut table.states[state_ids[0]]);
        for (_, advance_action) in &mut new_state.advance_actions {
            advance_action.state = group_ids_by_state_id[advance_action.state];
        }
        if let Some(eof_action) = &mut new_state.eof_action {
            eof_action.state = group_ids_by_state_id[eof_action.state];
        }
        new_states.push(new_state);
    }
    for state in &mut parse_table.states {
        state.lex_state_id = group_ids_by_state_id[state.lex_state_id];
    }
    table.states = new_states;
}
/// Returns `true` if the two lex states should be placed in different
/// minimization groups: i.e. some pair of corresponding advance actions
/// leads to states that currently belong to different groups.
///
/// Callers guarantee the two states share the same signature, so their
/// advance-action lists pair up positionally.
fn lex_states_differ(left: &LexState, right: &LexState, group_ids_by_state_id: &[usize]) -> bool {
    for ((_, left_action), (_, right_action)) in
        left.advance_actions.iter().zip(right.advance_actions.iter())
    {
        if group_ids_by_state_id[left_action.state] != group_ids_by_state_id[right_action.state] {
            return true;
        }
    }
    false
}
/// Reorders the lex states into a canonical order (by state contents) and
/// rewrites every state reference accordingly, including the parse table's
/// `lex_state_id`s. State 0 keeps its position.
fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) {
    // Choose the new ordering: `old_ids_by_new_id[new] == old`. The slice
    // starts at index 1 so that state 0 stays first.
    let mut old_ids_by_new_id = (0..table.states.len()).collect::<Vec<_>>();
    old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]);

    // Invert the mapping: `new_ids_by_old_id[old] == new`.
    let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];
    for (id, old_id) in old_ids_by_new_id.iter().enumerate() {
        new_ids_by_old_id[*old_id] = id;
    }

    // Move each state into its new slot (swapping out of the old vector to
    // avoid clones), remapping advance and EOF action targets.
    table.states = old_ids_by_new_id
        .iter()
        .map(|old_id| {
            let mut state = LexState::default();
            mem::swap(&mut state, &mut table.states[*old_id]);
            for (_, advance_action) in &mut state.advance_actions {
                advance_action.state = new_ids_by_old_id[advance_action.state];
            }
            if let Some(eof_action) = &mut state.eof_action {
                eof_action.state = new_ids_by_old_id[eof_action.state];
            }
            state
        })
        .collect();

    // Update the parse states to point at the relocated lex states.
    for state in &mut parse_table.states {
        state.lex_state_id = new_ids_by_old_id[state.lex_state_id];
    }
}