1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
//! Auto-generated module
//!
//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)
use super::functions::*;
use std::collections::{HashMap, HashSet};
/// Computes FIRST and FOLLOW sets for a context-free grammar.
#[allow(dead_code)]
pub struct FirstFollowSets<'a> {
/// The grammar.
pub grammar: &'a ContextFreeGrammar,
/// FIRST sets: nonterminal index → set of (`Option<terminal_index>`)
/// None represents ε.
pub first: Vec<HashSet<Option<usize>>>,
/// FOLLOW sets: nonterminal index → set of (`Option<terminal_index>`)
/// None represents end-of-input ($).
pub follow: Vec<HashSet<Option<usize>>>,
}
#[allow(dead_code)]
impl<'a> FirstFollowSets<'a> {
/// Compute FIRST and FOLLOW sets for the grammar.
pub fn compute(grammar: &'a ContextFreeGrammar) -> Self {
let mut first: Vec<HashSet<Option<usize>>> = vec![HashSet::new(); grammar.n_nonterms];
let mut follow: Vec<HashSet<Option<usize>>> = vec![HashSet::new(); grammar.n_nonterms];
let mut changed = true;
while changed {
changed = false;
for (lhs, rhs) in &grammar.productions {
if rhs.is_empty() {
if first[*lhs].insert(None) {
changed = true;
}
} else {
let (is_term, idx) = rhs[0];
if is_term {
if first[*lhs].insert(Some(idx)) {
changed = true;
}
} else if idx < grammar.n_nonterms {
let to_add: Vec<_> = first[idx].iter().cloned().collect();
for item in to_add {
if first[*lhs].insert(item) {
changed = true;
}
}
}
}
}
}
follow[grammar.start].insert(None);
let mut changed = true;
while changed {
changed = false;
for (lhs, rhs) in &grammar.productions {
for (i, &(is_term, idx)) in rhs.iter().enumerate() {
if !is_term && idx < grammar.n_nonterms {
if i + 1 < rhs.len() {
let next = rhs[i + 1];
if next.0 {
if follow[idx].insert(Some(next.1)) {
changed = true;
}
} else if next.1 < grammar.n_nonterms {
let to_add: Vec<_> = first[next.1].iter().cloned().collect();
for item in to_add {
if follow[idx].insert(item) {
changed = true;
}
}
}
} else {
let to_add: Vec<_> = follow[*lhs].iter().cloned().collect();
for item in to_add {
if follow[idx].insert(item) {
changed = true;
}
}
}
}
}
}
}
FirstFollowSets {
grammar,
first,
follow,
}
}
/// Get FIRST set for a nonterminal.
pub fn get_first(&self, nt: usize) -> &HashSet<Option<usize>> {
&self.first[nt]
}
/// Get FOLLOW set for a nonterminal.
pub fn get_follow(&self, nt: usize) -> &HashSet<Option<usize>> {
&self.follow[nt]
}
/// Check if a nonterminal is nullable (ε ∈ FIRST(nt)).
pub fn is_nullable(&self, nt: usize) -> bool {
self.first[nt].contains(&None)
}
}
/// An LL(1) parsing table for a given grammar.
#[allow(dead_code)]
pub struct Ll1Table {
/// For each (nonterminal, terminal) pair, the production index to use.
/// The terminal None represents end-of-input.
pub table: std::collections::HashMap<(usize, Option<usize>), usize>,
/// Number of nonterminals.
pub n_nonterms: usize,
/// Number of terminals.
pub n_terms: usize,
}
#[allow(dead_code)]
impl Ll1Table {
/// Create an empty LL(1) table.
pub fn new(n_nonterms: usize, n_terms: usize) -> Self {
Ll1Table {
table: std::collections::HashMap::new(),
n_nonterms,
n_terms,
}
}
/// Set the production for (nonterminal, terminal).
pub fn set(&mut self, nt: usize, term: Option<usize>, prod: usize) {
self.table.insert((nt, term), prod);
}
/// Look up the production for (nonterminal, terminal).
pub fn get(&self, nt: usize, term: Option<usize>) -> Option<usize> {
self.table.get(&(nt, term)).cloned()
}
/// Check if the table is conflict-free (no cell has more than one production).
/// Since we use a HashMap, each cell has at most one entry by construction.
pub fn is_conflict_free(&self) -> bool {
true
}
/// Count the number of filled table entries.
pub fn entry_count(&self) -> usize {
self.table.len()
}
/// Build an LL(1) table from a grammar and its FIRST/FOLLOW sets.
pub fn build(grammar: &ContextFreeGrammar, sets: &FirstFollowSets<'_>) -> Self {
let mut table = Ll1Table::new(grammar.n_nonterms, grammar.n_terms);
for (prod_idx, (lhs, rhs)) in grammar.productions.iter().enumerate() {
if rhs.is_empty() {
for &follow_item in sets.get_follow(*lhs) {
table.set(*lhs, follow_item, prod_idx);
}
} else {
let (is_term, idx) = rhs[0];
if is_term {
table.set(*lhs, Some(idx), prod_idx);
} else if idx < grammar.n_nonterms {
for &first_item in sets.get_first(idx) {
if let Some(t) = first_item {
table.set(*lhs, Some(t), prod_idx);
} else {
for &follow_item in sets.get_follow(*lhs) {
table.set(*lhs, follow_item, prod_idx);
}
}
}
}
}
}
table
}
}
/// A context-free grammar with nonterminals numbered 0..n_nonterms
/// and terminals numbered 0..n_terms.
#[allow(dead_code)]
pub struct ContextFreeGrammar {
/// Number of nonterminal symbols.
pub n_nonterms: usize,
/// Number of terminal symbols.
pub n_terms: usize,
/// Start symbol (index of nonterminal).
pub start: usize,
/// Productions: list of (lhs_nonterm, rhs_symbols) where rhs_symbols is
/// a list of (is_terminal: bool, index).
pub productions: Vec<(usize, Vec<(bool, usize)>)>,
}
#[allow(dead_code)]
impl ContextFreeGrammar {
/// Create a new CFG.
pub fn new(n_nonterms: usize, n_terms: usize, start: usize) -> Self {
ContextFreeGrammar {
n_nonterms,
n_terms,
start,
productions: Vec::new(),
}
}
/// Add a production: A → rhs.
pub fn add_production(&mut self, lhs: usize, rhs: Vec<(bool, usize)>) {
self.productions.push((lhs, rhs));
}
/// Get all productions for a given nonterminal.
pub fn productions_for(&self, nt: usize) -> Vec<&Vec<(bool, usize)>> {
self.productions
.iter()
.filter_map(|(lhs, rhs)| if *lhs == nt { Some(rhs) } else { None })
.collect()
}
/// Check if any production is nullable (A → ε).
pub fn is_nullable(&self, nt: usize) -> bool {
self.productions
.iter()
.any(|(lhs, rhs)| *lhs == nt && rhs.is_empty())
}
/// Count total number of productions.
pub fn production_count(&self) -> usize {
self.productions.len()
}
/// Check if the grammar is in Chomsky Normal Form:
/// every production is A → BC or A → a.
pub fn is_cnf(&self) -> bool {
for (_, rhs) in &self.productions {
match rhs.len() {
0 => return false,
1 => {
if rhs[0].0 {
} else {
return false;
}
}
2 => {
if rhs[0].0 || rhs[1].0 {
return false;
}
}
_ => return false,
}
}
true
}
}
/// A CYK (Cocke–Younger–Kasami) parser for CFGs in CNF.
#[allow(dead_code)]
pub struct CykParser<'a> {
/// The grammar (must be in CNF).
pub grammar: &'a ContextFreeGrammar,
}
#[allow(dead_code)]
impl<'a> CykParser<'a> {
/// Create a new CYK parser.
pub fn new(grammar: &'a ContextFreeGrammar) -> Self {
CykParser { grammar }
}
/// Run CYK on a sequence of terminal indices.
/// Returns true iff the input is in L(grammar).
pub fn parse(&self, input: &[usize]) -> bool {
let n = input.len();
if n == 0 {
return self.grammar.is_nullable(self.grammar.start);
}
let mut table: Vec<Vec<HashSet<usize>>> = vec![vec![HashSet::new(); n]; n];
for (i, &term) in input.iter().enumerate() {
for (lhs, rhs) in &self.grammar.productions {
if rhs.len() == 1 && rhs[0].0 && rhs[0].1 == term {
table[i][0].insert(*lhs);
}
}
}
for len in 2..=n {
for i in 0..=(n - len) {
for k in 0..(len - 1) {
for (lhs, rhs) in &self.grammar.productions {
if rhs.len() == 2 && !rhs[0].0 && !rhs[1].0 {
let b = rhs[0].1;
let c = rhs[1].1;
if table[i][k].contains(&b)
&& table[i + k + 1][len - k - 2].contains(&c)
{
table[i][len - 1].insert(*lhs);
}
}
}
}
}
}
table[0][n - 1].contains(&self.grammar.start)
}
/// Count the number of parse trees (ambiguity measure).
pub fn count_parses(&self, input: &[usize]) -> usize {
let n = input.len();
if n == 0 {
return if self.grammar.is_nullable(self.grammar.start) {
1
} else {
0
};
}
if self.parse(input) {
1
} else {
0
}
}
}
/// A runtime PEG expression.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub enum PegExpr {
/// Match a literal character.
Char(char),
/// Match any character.
Any,
/// Sequence: e1 followed by e2.
Seq(Box<PegExpr>, Box<PegExpr>),
/// Ordered choice: try e1, if it fails (without consuming input) try e2.
Choice(Box<PegExpr>, Box<PegExpr>),
/// Zero or more: e*
Star(Box<PegExpr>),
/// One or more: e+
Plus(Box<PegExpr>),
/// Optional: e?
Optional(Box<PegExpr>),
/// Negative lookahead: !e
Not(Box<PegExpr>),
/// Positive lookahead: &e
And(Box<PegExpr>),
}
#[allow(dead_code)]
impl PegExpr {
/// Try to match the expression at position `pos` in `input`.
/// Returns the new position if successful, or None on failure.
pub fn matches(&self, input: &str, pos: usize) -> Option<usize> {
let chars: Vec<char> = input.chars().collect();
self.match_at(&chars, pos)
}
fn match_at(&self, input: &[char], pos: usize) -> Option<usize> {
match self {
PegExpr::Char(c) => {
if pos < input.len() && input[pos] == *c {
Some(pos + 1)
} else {
None
}
}
PegExpr::Any => {
if pos < input.len() {
Some(pos + 1)
} else {
None
}
}
PegExpr::Seq(e1, e2) => {
let p1 = e1.match_at(input, pos)?;
e2.match_at(input, p1)
}
PegExpr::Choice(e1, e2) => e1.match_at(input, pos).or_else(|| e2.match_at(input, pos)),
PegExpr::Star(e) => {
let mut cur = pos;
loop {
match e.match_at(input, cur) {
Some(next) if next > cur => cur = next,
_ => break,
}
}
Some(cur)
}
PegExpr::Plus(e) => {
let first = e.match_at(input, pos)?;
let rest = PegExpr::Star(e.clone()).match_at(input, first)?;
Some(rest)
}
PegExpr::Optional(e) => Some(e.match_at(input, pos).unwrap_or(pos)),
PegExpr::Not(e) => {
if e.match_at(input, pos).is_some() {
None
} else {
Some(pos)
}
}
PegExpr::And(e) => {
e.match_at(input, pos)?;
Some(pos)
}
}
}
/// Check if the expression matches the entire string.
pub fn match_full(&self, input: &str) -> bool {
let chars: Vec<char> = input.chars().collect();
self.match_at(&chars, 0) == Some(chars.len())
}
}