use crate::grammar::{Grammar, GrammarLike, ProductionId, Symbol, TokenKind};
use crate::lexer::Token;
use super::state::{EarleyItem, StateSet};
#[derive(Debug, Clone)]
pub struct RecognitionResult {
pub state_sets: Vec<StateSet>,
pub accepted: bool,
}
#[derive(Debug, Clone, Copy)]
pub struct Recognizer<'g, G: GrammarLike + ?Sized = Grammar> {
grammar: &'g G,
}
impl<'g, G: GrammarLike + ?Sized> Recognizer<'g, G> {
#[must_use]
pub const fn new(grammar: &'g G) -> Self {
Self { grammar }
}
#[must_use]
pub fn recognize(&self, tokens: &[Token<'_>]) -> RecognitionResult {
let mut state_sets: Vec<StateSet> = (0..=tokens.len()).map(|_| StateSet::new()).collect();
let start_id = self.grammar.start();
if let Some(start) = self.grammar.production(start_id) {
for (alt_idx, _) in start.alternatives.iter().enumerate() {
state_sets[0].push(EarleyItem::new(start_id, alt_idx, 0));
}
}
for k in 0..=tokens.len() {
self.close_set_inner(&mut state_sets, k);
if k < tokens.len() {
self.scan(&mut state_sets, k, tokens[k].kind);
}
}
let accepted = self.is_accepted(&state_sets, tokens.len());
RecognitionResult {
state_sets,
accepted,
}
}
fn close_set_inner(&self, state_sets: &mut [StateSet], k: usize) {
let mut i = 0;
while i < state_sets[k].items().len() {
let item = state_sets[k].items()[i];
if item.is_complete(self.grammar) {
self.complete(state_sets, k, item);
} else if let Some(symbol) = item.next_symbol(self.grammar) {
match symbol {
Symbol::Nonterminal(b) => self.predict(state_sets, k, *b),
Symbol::Terminal(_) => {
}
Symbol::Repeat(_, _) | Symbol::Choice(_) => {
}
}
}
i += 1;
}
}
fn predict(&self, state_sets: &mut [StateSet], k: usize, nonterminal: ProductionId) {
let Some(production) = self.grammar.production(nonterminal) else {
return;
};
for (alt_idx, _) in production.alternatives.iter().enumerate() {
let new_item = EarleyItem::new(nonterminal, alt_idx, k);
state_sets[k].push(new_item);
}
}
fn complete(&self, state_sets: &mut [StateSet], k: usize, completed: EarleyItem) {
let origin = completed.origin;
let waiting: Vec<EarleyItem> = state_sets[origin]
.items()
.iter()
.copied()
.filter(|it| {
matches!(
it.next_symbol(self.grammar),
Some(Symbol::Nonterminal(b)) if *b == completed.production
)
})
.collect();
for it in waiting {
state_sets[k].push(it.advance());
}
}
fn scan(&self, state_sets: &mut [StateSet], k: usize, token: TokenKind) {
let advancing: Vec<EarleyItem> = state_sets[k]
.items()
.iter()
.copied()
.filter(|it| {
matches!(
it.next_symbol(self.grammar),
Some(Symbol::Terminal(t)) if *t == token
)
})
.collect();
for it in advancing {
state_sets[k + 1].push(it.advance());
}
}
fn is_accepted(&self, state_sets: &[StateSet], n: usize) -> bool {
let Some(final_set) = state_sets.get(n) else {
return false;
};
final_set.items().iter().any(|it| {
it.production == self.grammar.start() && it.origin == 0 && it.is_complete(self.grammar)
})
}
}
#[cfg(test)]
mod tests {
#![allow(clippy::expect_used, clippy::panic)]
use super::*;
use crate::grammar::{Alternative, Grammar, IdlVersion, Production, SpecRef};
const TS: SpecRef = SpecRef {
doc: "TEST",
section: "0.0",
};
fn t(kind: TokenKind) -> Token<'static> {
Token::synthetic(kind)
}
const fn prod(id: u32, name: &'static str, alts: &'static [Alternative]) -> Production {
Production {
id: ProductionId(id),
name,
spec_ref: TS,
alternatives: alts,
ast_hint: None,
}
}
const fn alt(symbols: &'static [Symbol]) -> Alternative {
Alternative {
name: None,
symbols,
note: None,
}
}
const G_SINGLE_TERMINAL: Grammar = Grammar {
name: "single",
version: IdlVersion::V4_2,
productions: &[prod(
0,
"a",
&[alt(&[Symbol::Terminal(TokenKind::Keyword("x"))])],
)],
start: ProductionId(0),
token_rules: &[],
};
const G_SEQUENCE: Grammar = Grammar {
name: "seq",
version: IdlVersion::V4_2,
productions: &[prod(
0,
"a",
&[alt(&[
Symbol::Terminal(TokenKind::Keyword("x")),
Symbol::Terminal(TokenKind::Keyword("y")),
])],
)],
start: ProductionId(0),
token_rules: &[],
};
const G_ALTERNATIVES: Grammar = Grammar {
name: "alts",
version: IdlVersion::V4_2,
productions: &[prod(
0,
"a",
&[
alt(&[Symbol::Terminal(TokenKind::Keyword("x"))]),
alt(&[Symbol::Terminal(TokenKind::Keyword("y"))]),
],
)],
start: ProductionId(0),
token_rules: &[],
};
const G_NESTED: Grammar = Grammar {
name: "nested",
version: IdlVersion::V4_2,
productions: &[
prod(
0,
"a",
&[alt(&[
Symbol::Nonterminal(ProductionId(1)),
Symbol::Terminal(TokenKind::Keyword("y")),
])],
),
prod(1, "b", &[alt(&[Symbol::Terminal(TokenKind::Keyword("x"))])]),
],
start: ProductionId(0),
token_rules: &[],
};
const G_LEFT_RECURSIVE: Grammar = Grammar {
name: "left_rec",
version: IdlVersion::V4_2,
productions: &[prod(
0,
"a",
&[
alt(&[
Symbol::Nonterminal(ProductionId(0)),
Symbol::Terminal(TokenKind::Punct("+")),
Symbol::Terminal(TokenKind::Keyword("n")),
]),
alt(&[Symbol::Terminal(TokenKind::Keyword("n"))]),
],
)],
start: ProductionId(0),
token_rules: &[],
};
const G_RIGHT_RECURSIVE: Grammar = Grammar {
name: "right_rec",
version: IdlVersion::V4_2,
productions: &[prod(
0,
"a",
&[
alt(&[
Symbol::Terminal(TokenKind::Keyword("n")),
Symbol::Terminal(TokenKind::Punct("+")),
Symbol::Nonterminal(ProductionId(0)),
]),
alt(&[Symbol::Terminal(TokenKind::Keyword("n"))]),
],
)],
start: ProductionId(0),
token_rules: &[],
};
const G_EPSILON: Grammar = Grammar {
name: "epsilon",
version: IdlVersion::V4_2,
productions: &[prod(
0,
"a",
&[
alt(&[]), alt(&[Symbol::Terminal(TokenKind::Keyword("x"))]),
],
)],
start: ProductionId(0),
token_rules: &[],
};
#[test]
fn recognize_single_terminal_input() {
let r = Recognizer::new(&G_SINGLE_TERMINAL);
let result = r.recognize(&[t(TokenKind::Keyword("x"))]);
assert!(result.accepted);
assert_eq!(result.state_sets.len(), 2);
}
#[test]
fn recognize_two_terminals_in_sequence() {
let r = Recognizer::new(&G_SEQUENCE);
let result = r.recognize(&[t(TokenKind::Keyword("x")), t(TokenKind::Keyword("y"))]);
assert!(result.accepted);
assert_eq!(result.state_sets.len(), 3);
}
#[test]
fn recognize_first_alternative() {
let r = Recognizer::new(&G_ALTERNATIVES);
assert!(r.recognize(&[t(TokenKind::Keyword("x"))]).accepted);
}
#[test]
fn recognize_second_alternative() {
let r = Recognizer::new(&G_ALTERNATIVES);
assert!(r.recognize(&[t(TokenKind::Keyword("y"))]).accepted);
}
#[test]
fn recognize_nonterminal_nesting() {
let r = Recognizer::new(&G_NESTED);
assert!(
r.recognize(&[t(TokenKind::Keyword("x")), t(TokenKind::Keyword("y"))])
.accepted
);
}
#[test]
fn recognize_left_recursive_grammar() {
let r = Recognizer::new(&G_LEFT_RECURSIVE);
let tokens = [
t(TokenKind::Keyword("n")),
t(TokenKind::Punct("+")),
t(TokenKind::Keyword("n")),
t(TokenKind::Punct("+")),
t(TokenKind::Keyword("n")),
];
assert!(r.recognize(&tokens).accepted);
}
#[test]
fn recognize_right_recursive_grammar() {
let r = Recognizer::new(&G_RIGHT_RECURSIVE);
let tokens = [
t(TokenKind::Keyword("n")),
t(TokenKind::Punct("+")),
t(TokenKind::Keyword("n")),
];
assert!(r.recognize(&tokens).accepted);
}
#[test]
fn recognize_epsilon_with_empty_input() {
let r = Recognizer::new(&G_EPSILON);
assert!(r.recognize(&[]).accepted);
}
#[test]
fn recognize_epsilon_with_terminal_input() {
let r = Recognizer::new(&G_EPSILON);
assert!(r.recognize(&[t(TokenKind::Keyword("x"))]).accepted);
}
#[test]
fn rejects_input_for_wrong_terminal() {
let r = Recognizer::new(&G_SINGLE_TERMINAL);
assert!(!r.recognize(&[t(TokenKind::Keyword("y"))]).accepted);
}
#[test]
fn rejects_partial_input() {
let r = Recognizer::new(&G_SEQUENCE);
assert!(!r.recognize(&[t(TokenKind::Keyword("x"))]).accepted);
}
#[test]
fn rejects_extra_input_at_end() {
let r = Recognizer::new(&G_SINGLE_TERMINAL);
assert!(
!r.recognize(&[t(TokenKind::Keyword("x")), t(TokenKind::Keyword("y"))])
.accepted
);
}
#[test]
fn rejects_empty_input_when_grammar_requires_terminal() {
let r = Recognizer::new(&G_SINGLE_TERMINAL);
assert!(!r.recognize(&[]).accepted);
}
#[test]
fn state_set_count_is_tokens_plus_one() {
let r = Recognizer::new(&G_SEQUENCE);
let result = r.recognize(&[
t(TokenKind::Keyword("x")),
t(TokenKind::Keyword("y")),
t(TokenKind::Keyword("y")), ]);
assert_eq!(result.state_sets.len(), 4);
assert!(!result.accepted);
}
#[test]
fn predict_populates_initial_set_with_alternatives() {
let r = Recognizer::new(&G_ALTERNATIVES);
let result = r.recognize(&[]);
assert_eq!(result.state_sets[0].len(), 2);
assert!(
result.state_sets[0]
.items()
.iter()
.all(|it| it.production == ProductionId(0) && it.origin == 0 && it.dot == 0)
);
}
#[test]
fn predict_descends_into_nonterminal() {
let r = Recognizer::new(&G_NESTED);
let result = r.recognize(&[]);
let items = result.state_sets[0].items();
assert!(items.iter().any(|it| it.production == ProductionId(0)));
assert!(items.iter().any(|it| it.production == ProductionId(1)));
}
#[test]
fn complete_advances_parent_item() {
let r = Recognizer::new(&G_NESTED);
let result = r.recognize(&[t(TokenKind::Keyword("x"))]);
let s1 = &result.state_sets[1];
assert!(s1.items().iter().any(|it| it.production == ProductionId(0)
&& it.alternative_index == 0
&& it.dot == 1
&& it.origin == 0));
}
#[test]
fn empty_grammar_accepts_only_empty_input() {
const G_EMPTY_PROD: Grammar = Grammar {
name: "empty_prod",
version: IdlVersion::V4_2,
productions: &[prod(0, "a", &[alt(&[])])],
start: ProductionId(0),
token_rules: &[],
};
let r = Recognizer::new(&G_EMPTY_PROD);
assert!(r.recognize(&[]).accepted);
assert!(!r.recognize(&[t(TokenKind::Keyword("x"))]).accepted);
}
#[test]
fn repeat_and_choice_symbols_are_skipped_phase_zero() {
const G_REPEAT: Grammar = Grammar {
name: "with_repeat",
version: IdlVersion::V4_2,
productions: &[prod(
0,
"a",
&[
alt(&[Symbol::Repeat(
crate::grammar::RepeatKind::Optional,
&[Symbol::Terminal(TokenKind::Keyword("x"))],
)]),
alt(&[Symbol::Terminal(TokenKind::Keyword("y"))]),
],
)],
start: ProductionId(0),
token_rules: &[],
};
let r = Recognizer::new(&G_REPEAT);
assert!(r.recognize(&[t(TokenKind::Keyword("y"))]).accepted);
assert!(!r.recognize(&[t(TokenKind::Keyword("x"))]).accepted);
}
#[test]
fn duplicate_predicts_do_not_explode_state_set() {
let r = Recognizer::new(&G_LEFT_RECURSIVE);
let result = r.recognize(&[t(TokenKind::Keyword("n"))]);
assert!(result.state_sets[0].len() < 10);
}
}