parsuna 0.1.0

Parsuna: recoverable, pull-based parsers with precise errors
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
//! First lowering phase: rule bodies → symbolic `Block`/`Op` IR.
//!
//! The output of this phase is a [`Program`] whose blocks reference one
//! another by `BlockId` and reference rule entry points by name. It
//! interns FIRST and SYNC sets so downstream code can refer to them by a
//! small integer, which keeps the generated tables compact and lets two
//! sites share a single table entry when they happen to have the same set.
//!
//! `layout.rs` picks up from here to assign concrete state ids and resolve
//! every `Target` into a `StateId`.

use std::collections::{BTreeSet, HashMap};

use crate::analysis::{self, AnalyzedGrammar, EOF_MARKER};
use crate::grammar::ir::*;
use crate::lowering::{
    FirstSet, FirstSetId, FirstSetPool, LookaheadSeq, SyncSet, SyncSetId, SyncSetPool, TokenInfo,
};

/// Index into [`Program::blocks`]. Used wherever a block needs to refer to
/// another one before state ids exist.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct BlockId(
    /// Index into [`Program::blocks`].
    pub u32,
);

/// Control-transfer target used inside a [`Block`]'s ops.
///
/// `Rule(name)` is a call to a top-level rule; the actual block id is
/// looked up via [`Program::rule_entry`] during layout. `Block(id)` is a
/// direct reference to a local block (arm body, loop body, etc.).
#[derive(Clone, Debug)]
pub enum Target {
    /// Call by rule name. Resolved via [`Program::rule_entry`] during layout.
    Rule(String),
    /// Direct reference to a block in the same `Program`.
    Block(BlockId),
}

/// Op in the block-level IR — one level above the state-machine ops in
/// [`crate::lowering::Op`]. `Call` and `Target::Rule` still exist here
/// because we haven't resolved rule calls to block ids yet, and dispatch
/// arms are still a flat list rather than a tree.
#[derive(Clone, Debug)]
pub enum Op {
    /// Emit an `Enter` event for rule-kind `kind`.
    Enter {
        /// Rule-kind id (index into [`Program::rules`]).
        kind: u16,
    },
    /// Emit an `Exit` event for rule-kind `kind`.
    Exit {
        /// Rule-kind id (index into [`Program::rules`]).
        kind: u16,
    },
    /// Consume a token of `kind`; on mismatch, emit an error and recover
    /// to `sync`.
    Expect {
        /// Required token-kind id.
        kind: i16,
        /// Token name, retained only for the diagnostic message.
        token_name: String,
        /// SYNC-set id to recover to on mismatch.
        sync: SyncSetId,
    },
    /// Recursive call into another rule or block.
    Call {
        /// Callee — a sibling block or another rule by name.
        target: Target,
    },
    /// `?` branch — call `body` zero or one times.
    Opt {
        /// FIRST-set id the body opens with.
        first: FirstSetId,
        /// Body to call when the lookahead matches `first`.
        body: Target,
    },
    /// `*` loop — repeatedly call `body` while the lookahead matches.
    Star {
        /// FIRST-set id the body opens with.
        first: FirstSetId,
        /// Body of one iteration.
        body: Target,
    },
    /// `Alt` dispatch — pick one arm based on up to `k` tokens of
    /// lookahead, or recover via `sync`.
    Dispatch {
        /// `(FIRST-set id, arm body)` pairs, one per non-nullable arm.
        arms: Vec<(FirstSetId, Target)>,
        /// True if any arm is nullable — changes the default action from
        /// "error and recover" to "fall through".
        has_eps: bool,
        /// SYNC-set id to recover to on "unexpected token".
        sync: SyncSetId,
    },
}

/// A sequence of ops with a label prefix used for diagnostic display.
///
/// `op_labels[i]` is a human-readable name for `ops[i]` (e.g.
/// `expr:call:atom`) that carries through to the emitted code as a comment
/// and into the debug dumps.
#[derive(Clone)]
pub struct Block {
    /// Human-readable prefix that every op label in this block starts with.
    /// Preserves a rule-hierarchical trail in debug dumps, e.g. `expr:alt1`.
    pub label_prefix: String,
    /// Ops in execution order.
    pub ops: Vec<Op>,
    /// Parallel to `ops`: `op_labels[i]` is the diagnostic label for
    /// `ops[i]`. Kept out-of-band so `Op` stays a plain sum type.
    pub op_labels: Vec<String>,
}

/// The output of the build phase: every block, the entry-block map, and
/// the interned FIRST/SYNC pools.
///
/// `rule_order` preserves grammar declaration order so that the layout
/// phase can walk rules in a deterministic, user-visible order.
/// `public_rule_names` is the subset that becomes part of the generated
/// API (fragments are kept in `rule_entry` but not here).
pub struct Program {
    /// All blocks — rule bodies plus every synthetic sub-block (alt arms,
    /// loop bodies, etc.). Referenced by `BlockId(index)`.
    pub blocks: Vec<Block>,
    /// Rule name → entry block. Covers both public rules and fragments.
    pub rule_entry: HashMap<String, BlockId>,
    /// Rules in grammar declaration order. Used so `layout` numbers
    /// states in a stable, human-readable order.
    pub rule_order: Vec<String>,
    /// Rules that should become public API (fragments excluded).
    pub public_rule_names: BTreeSet<String>,
    /// Token metadata mirroring the analyzed grammar, fragments resolved.
    pub tokens: Vec<TokenInfo>,
    /// Names of the public (non-fragment) rules. A rule's `RuleKind` id
    /// is its index here.
    pub rules: Vec<String>,
    /// Interned FIRST-set pool. Each entry is a `FirstSet` — a list of
    /// `LookaheadSeq`s (i.e. `Vec<Vec<i16>>`). Index by [`FirstSetId`].
    pub first_sets: FirstSetPool,
    /// Interned SYNC-set pool. Each entry is a flat `Vec<i16>` of token
    /// ids. Index by [`SyncSetId`].
    pub sync_sets: SyncSetPool,
    /// Token-kind id reserved for end-of-input.
    pub eof_id: i16,
    /// Token-kind id reserved for lexer ERROR.
    pub error_id: i16,
}

/// Mutable state carried through the recursive descent over rule bodies.
///
/// `current_sync` is the SYNC-set id to stamp on any `Expect` ops we emit
/// inside the rule we are currently lowering — every rule computes its own
/// SYNC (derived from its FOLLOW set) up front so inner sites don't have
/// to recompute it. `current_symbol` is just the label prefix we use for
/// sub-blocks so their op labels read like `expr:alt0:expect:LPAREN`.
struct Builder<'a> {
    ag: &'a AnalyzedGrammar,
    blocks: Vec<Block>,
    first_sets: FirstSetPool,
    first_intern: HashMap<FirstSet, FirstSetId>,
    sync_sets: SyncSetPool,
    sync_intern: HashMap<SyncSet, SyncSetId>,
    current_sync: SyncSetId,
    current_symbol: String,
}

/// Lower every rule in `ag` to a [`Program`]. Walks the rules in grammar
/// order, wrapping public rules in `Enter`/`Exit` structural events and
/// inlining fragment rules raw (they produce no structural events).
pub fn build(ag: &AnalyzedGrammar) -> Program {
    let g = &ag.grammar;
    let rules = collect_rules(g);

    // Assign dense kind ids starting at 1 — EOF (0) and ERROR (-1) are
    // reserved runtime sentinels. Fragments are filtered out because they
    // never become a real token kind at run time.
    let tokens: Vec<TokenInfo> = g
        .tokens
        .iter()
        .filter(|t| !t.is_fragment)
        .enumerate()
        .map(|(i, t)| TokenInfo {
            name: t.name.clone(),
            pattern: resolve_pattern(&t.pattern, g),
            skip: t.skip,
            kind: (i + 1) as i16,
        })
        .collect();
    let error_id: i16 = parsuna_rt::TOKEN_ERROR;
    let eof_id: i16 = parsuna_rt::TOKEN_EOF;

    let mut b = Builder {
        ag,
        blocks: Vec::new(),
        first_sets: Vec::new(),
        first_intern: HashMap::new(),
        sync_sets: Vec::new(),
        sync_intern: HashMap::new(),
        current_sync: 0,
        current_symbol: String::new(),
    };

    let mut rule_entry: HashMap<String, BlockId> = HashMap::new();
    let mut rule_order: Vec<String> = Vec::with_capacity(g.rules.len());
    for rule in &g.rules {
        // Every rule gets its own SYNC set (its FOLLOW plus EOF). Interned
        // up-front and stashed in `current_sync` so every `Expect` emitted
        // inside this rule's body can refer to it without recomputing.
        let sync = compute_sync(ag, &rule.name, eof_id);
        b.current_sync = b.intern_sync(sync);
        b.current_symbol = rule.name.clone();
        let bid = b.new_block(rule.name.clone());
        rule_entry.insert(rule.name.clone(), bid);
        rule_order.push(rule.name.clone());

        let rule_tail = b.ag.follow_k.get(&rule.name).cloned().unwrap_or_default();

        if !rule.is_fragment {
            // Non-fragment rules produce Enter/Exit events so consumers
            // see their subtree in the event stream. Fragments are
            // "transparent" — their body is emitted raw.
            let kind = rule_kind_id(&rules, &rule.name);
            let enter_label = format!("{}:enter", rule.name);
            let exit_label = format!("{}:exit", rule.name);
            b.push_op(bid, Op::Enter { kind }, enter_label);
            b.emit_expr(&rule.body, bid, &rule_tail);
            b.push_op(bid, Op::Exit { kind }, exit_label);
        } else {
            b.emit_expr(&rule.body, bid, &rule_tail);
        }
    }

    let public_rule_names: BTreeSet<String> = g
        .rules
        .iter()
        .filter(|r| !r.is_fragment)
        .map(|r| r.name.clone())
        .collect();
    let Builder {
        blocks,
        first_sets,
        sync_sets,
        ..
    } = b;
    Program {
        blocks,
        rule_entry,
        rule_order,
        public_rule_names,
        tokens,
        rules,
        first_sets,
        sync_sets,
        eof_id,
        error_id,
    }
}

impl Builder<'_> {
    /// Allocate a fresh empty block and return its id.
    fn new_block(&mut self, label_prefix: String) -> BlockId {
        let id = BlockId(self.blocks.len() as u32);
        self.blocks.push(Block {
            label_prefix,
            ops: Vec::new(),
            op_labels: Vec::new(),
        });
        id
    }

    /// Append an op (with its matching diagnostic label) to a block.
    fn push_op(&mut self, bid: BlockId, op: Op, label: String) {
        let block = &mut self.blocks[bid.0 as usize];
        block.ops.push(op);
        block.op_labels.push(label);
    }

    /// Intern a FIRST set (a list of token-id sequences) and return its id.
    ///
    /// Sort+dedup first so two call sites that produce the same set via
    /// different orderings still hash to one entry — that turns N
    /// syntactically-similar grammar fragments into a single shared table
    /// row in the final output.
    fn intern_first(&mut self, mut seqs: FirstSet) -> FirstSetId {
        seqs.sort();
        seqs.dedup();
        if let Some(id) = self.first_intern.get(&seqs) {
            return *id;
        }
        let id = self.first_sets.len() as FirstSetId;
        self.first_intern.insert(seqs.clone(), id);
        self.first_sets.push(seqs);
        id
    }

    /// Intern a SYNC set (a list of token ids). Same sort+dedup strategy
    /// as `intern_first`.
    fn intern_sync(&mut self, mut toks: SyncSet) -> SyncSetId {
        toks.sort();
        toks.dedup();
        if let Some(id) = self.sync_intern.get(&toks) {
            return *id;
        }
        let id = self.sync_sets.len() as SyncSetId;
        self.sync_intern.insert(toks.clone(), id);
        self.sync_sets.push(toks);
        id
    }

    fn first_of_expr(&self, e: &Expr) -> analysis::FirstSet {
        analysis::first_follow::first_of(e, &self.ag.nullable, &self.ag.first, self.ag.k)
    }

    /// Intern a FIRST set after converting its string-name sequences into
    /// numeric token ids and dropping the ε sequence — dispatches use
    /// nullability as a separate bit (`has_eps`) rather than a sentinel
    /// inside the set.
    fn first_ids(&mut self, first: &analysis::FirstSet) -> FirstSetId {
        let seqs: FirstSet = first
            .iter()
            .filter(|seq| !seq.is_empty())
            .map(|seq| seq.iter().map(|n| token_kind(self.ag, n)).collect::<LookaheadSeq>())
            .collect();
        self.intern_first(seqs)
    }

    /// Build a sub-block for a piece of grammar (an alternative arm, a
    /// `*`-body, etc.), emit `body_expr` into it, and return the block id.
    ///
    /// Swaps `current_symbol` for the duration so the sub-block's ops get
    /// hierarchical labels (e.g. `expr:alt0:call:atom` rather than
    /// `expr:call:atom`) which is much easier to read in dumps.
    fn build_sub(
        &mut self,
        sub_suffix: &str,
        body_expr: &Expr,
        tail: &analysis::FirstSet,
    ) -> BlockId {
        let sub_label = format!("{}:{}", self.current_symbol, sub_suffix);
        let sub = self.new_block(sub_label.clone());
        let saved = std::mem::replace(&mut self.current_symbol, sub_label);
        self.emit_expr(body_expr, sub, tail);
        self.current_symbol = saved;
        sub
    }

    /// Lower one grammar expression into ops appended to `cur`.
    ///
    /// `tail` is the FIRST(k) of "everything that follows this expression
    /// in the enclosing context". It's needed to compute accurate
    /// prediction sets for nullable arms and loop bodies — if an arm can
    /// match ε, what actually distinguishes it is the tokens that would
    /// follow, not the arm itself.
    fn emit_expr(&mut self, e: &Expr, cur: BlockId, tail: &analysis::FirstSet) {
        let k = self.ag.k;
        match e {
            Expr::Empty => {}
            Expr::Token(name) => {
                let kind = token_kind(self.ag, name);
                let label = format!("{}:expect:{}", self.current_symbol, name);
                self.push_op(
                    cur,
                    Op::Expect {
                        kind,
                        token_name: name.clone(),
                        sync: self.current_sync,
                    },
                    label,
                );
            }
            Expr::Rule(name) => {
                let label = format!("{}:call:{}", self.current_symbol, name);
                self.push_op(
                    cur,
                    Op::Call {
                        target: Target::Rule(name.clone()),
                    },
                    label,
                );
            }
            Expr::Seq(xs) => {
                // Compute the tail for every position right-to-left: the
                // tail after xs[i] is FIRST(xs[i+1..]) concatenated with
                // the outer tail. Each child is then emitted with the
                // correct forward-looking context.
                let n = xs.len();
                let mut succ: Vec<analysis::FirstSet> = vec![tail.clone(); n + 1];
                for i in (0..n).rev() {
                    let fx = self.first_of_expr(&xs[i]);
                    succ[i] = analysis::first_follow::concat_k(&fx, &succ[i + 1], k);
                }
                for i in 0..n {
                    self.emit_expr(&xs[i], cur, &succ[i + 1]);
                }
            }
            Expr::Alt(xs) => {
                let firsts: Vec<analysis::FirstSet> =
                    xs.iter().map(|x| self.first_of_expr(x)).collect();

                // `has_eps` is hoisted out of the per-arm FIRST sets
                // because the dispatch node needs it as a single bit —
                // "is any arm nullable?". Nullable arms are handled via a
                // fall-through default, not by putting ε in a FIRST set.
                let has_eps = firsts.iter().any(|f| f.iter().any(|seq| seq.is_empty()));
                let mut arms: Vec<(FirstSetId, Target)> = Vec::new();
                for (idx, (arm_expr, first)) in xs.iter().zip(firsts.iter()).enumerate() {
                    // A wholly nullable arm (only ε in its FIRST) is
                    // already covered by `has_eps` fall-through — no need
                    // to dispatch into it.
                    if first.iter().all(|seq| seq.is_empty()) {
                        continue;
                    }

                    // Extend FIRST with the surrounding tail so nullable
                    // pieces of the arm are disambiguated by what follows
                    // the whole alternation.
                    let predict = analysis::first_follow::concat_k(first, tail, k);
                    let (non_eps, _) = analysis::first_follow::split_nullable(&predict);
                    let fid = self.first_ids(&non_eps);

                    let sub = self.build_sub(&format!("alt{}", idx), arm_expr, tail);
                    arms.push((fid, Target::Block(sub)));
                }
                let label = format!("{}:dispatch", self.current_symbol);
                self.push_op(
                    cur,
                    Op::Dispatch {
                        arms,
                        has_eps,
                        sync: self.current_sync,
                    },
                    label,
                );
            }
            Expr::Opt(x) => {
                let first = self.first_of_expr(x);
                let fid = self.first_ids(&first);

                let sub = self.build_sub("opt-body", x, tail);
                let label = format!("{}:opt", self.current_symbol);
                self.push_op(
                    cur,
                    Op::Opt {
                        first: fid,
                        body: Target::Block(sub),
                    },
                    label,
                );
            }
            Expr::Star(x) => {
                let first = self.first_of_expr(x);
                let fid = self.first_ids(&first);

                // Inside a star, each body iteration can be followed by
                // another iteration *or* by the outer tail — the body's
                // tail therefore needs to include FIRST(body) prepended.
                let body_tail = analysis::first_follow::concat_k(&first, tail, k);
                let sub = self.build_sub("star-body", x, &body_tail);
                let label = format!("{}:star", self.current_symbol);
                self.push_op(
                    cur,
                    Op::Star {
                        first: fid,
                        body: Target::Block(sub),
                    },
                    label,
                );
            }
            Expr::Plus(x) => {
                // `x+` = `x x*`: emit one guaranteed call, then the same
                // body as a `*` loop. Sharing the sub-block between the
                // call and the loop means the body is lowered once.
                let first = self.first_of_expr(x);
                let fid = self.first_ids(&first);
                let body_tail = analysis::first_follow::concat_k(&first, tail, k);
                let sub = self.build_sub("plus-body", x, &body_tail);
                let call_label = format!("{}:plus-first", self.current_symbol);
                self.push_op(
                    cur,
                    Op::Call {
                        target: Target::Block(sub),
                    },
                    call_label,
                );
                let star_label = format!("{}:plus-star", self.current_symbol);
                self.push_op(
                    cur,
                    Op::Star {
                        first: fid,
                        body: Target::Block(sub),
                    },
                    star_label,
                );
            }
        }
    }
}

/// Names of the public (non-fragment) rules, in declaration order.
/// Fragments are excluded because they don't become `RuleKind` variants.
fn collect_rules(g: &Grammar) -> Vec<String> {
    let mut set: BTreeSet<String> = BTreeSet::new();
    for r in &g.rules {
        if r.is_fragment {
            continue;
        }
        set.insert(r.name.clone());
    }
    set.into_iter().collect()
}

/// SYNC set for a rule: every real token in its FOLLOW plus EOF. `EOF`
/// always sits at the end so the recovery loop will stop if nothing else
/// catches it, rather than running off the input.
fn compute_sync(ag: &AnalyzedGrammar, rule_name: &str, eof_id: i16) -> SyncSet {
    let follow = ag.follow.get(rule_name).cloned().unwrap_or_default();
    let mut names: Vec<&str> = follow
        .iter()
        .filter(|t| t.as_str() != EOF_MARKER)
        .map(|t| t.as_str())
        .collect();
    names.sort();
    names.dedup();
    let mut ids: SyncSet = names.iter().map(|n| token_kind(ag, n)).collect();
    ids.push(eof_id);
    ids
}

/// Look up the numeric kind id of a token by name. Fragments are excluded
/// from the numbering (they never appear at run time), so the id is 1-based
/// over the non-fragment tokens.
fn token_kind(ag: &AnalyzedGrammar, name: &str) -> i16 {
    ag.grammar
        .tokens
        .iter()
        .filter(|t| !t.is_fragment)
        .position(|t| t.name == name)
        .map(|i| (i + 1) as i16)
        .unwrap_or(0)
}

/// Inline every `TokenPattern::Ref` into the referenced token's body.
/// Fragments have already been validated to exist and be acyclic, so the
/// recursion always terminates.
fn resolve_pattern(p: &TokenPattern, g: &Grammar) -> TokenPattern {
    match p {
        TokenPattern::Empty | TokenPattern::Literal(_) | TokenPattern::Class(_) => p.clone(),
        TokenPattern::Ref(n) => match g.token(n) {
            Some(td) => resolve_pattern(&td.pattern, g),
            None => TokenPattern::Empty,
        },
        TokenPattern::Seq(xs) => {
            TokenPattern::Seq(xs.iter().map(|x| resolve_pattern(x, g)).collect())
        }
        TokenPattern::Alt(xs) => {
            TokenPattern::Alt(xs.iter().map(|x| resolve_pattern(x, g)).collect())
        }
        TokenPattern::Opt(x) => TokenPattern::Opt(Box::new(resolve_pattern(x, g))),
        TokenPattern::Star(x) => TokenPattern::Star(Box::new(resolve_pattern(x, g))),
        TokenPattern::Plus(x) => TokenPattern::Plus(Box::new(resolve_pattern(x, g))),
    }
}

/// Numeric id of a rule, as its position in `rules` (public rules only).
fn rule_kind_id(rules: &[String], name: &str) -> u16 {
    rules
        .iter()
        .position(|n| n == name)
        .map(|i| i as u16)
        .unwrap_or(0)
}