trampoline_parser/ir.rs
1//! Intermediate Representation for the grammar
2//!
3//! These types represent the grammar after DSL processing but before code generation.
4//! This is a scannerless (lexerless) parser - no separate tokenization phase.
5
6use crate::Assoc;
7
8/// A parser rule definition
9#[derive(Debug, Clone)]
10pub struct RuleDef {
11 pub name: String,
12 pub combinator: Combinator,
13}
14
15/// Parser combinators for scannerless parsing
16#[derive(Debug, Clone)]
17pub enum Combinator {
18 /// Reference another rule by name
19 Rule(String),
20 /// Sequence of combinators
21 Sequence(Vec<Combinator>),
22 /// Ordered choice (first match wins, auto-backtrack)
23 Choice(Vec<Combinator>),
24 /// Zero or more
25 ZeroOrMore(Box<Combinator>),
26 /// One or more
27 OneOrMore(Box<Combinator>),
28 /// Optional (zero or one)
29 Optional(Box<Combinator>),
30 /// Parse but discard result
31 Skip(Box<Combinator>),
32 /// Separated list: item (sep item)*
33 SeparatedBy {
34 item: Box<Combinator>,
35 separator: Box<Combinator>,
36 trailing: bool,
37 },
38 /// Pratt expression parsing
39 Pratt(PrattDef),
40 /// AST mapping applied to inner combinator
41 Mapped {
42 inner: Box<Combinator>,
43 mapping: String,
44 },
45
46 // === Character-level primitives ===
47 /// Match a literal string exactly (e.g., "if", "===", "+")
48 Literal(String),
49 /// Match a single character
50 Char(char),
51 /// Match a character class (digit, alpha, etc.)
52 CharClass(CharClass),
53 /// Match a character range (e.g., 'a'..='z')
54 CharRange(char, char),
55 /// Match any single character
56 AnyChar,
57 /// Negative lookahead (match if inner does NOT match, consume nothing)
58 NotFollowedBy(Box<Combinator>),
59 /// Positive lookahead (match if inner matches, consume nothing)
60 FollowedBy(Box<Combinator>),
61 /// Capture the matched text as a string
62 Capture(Box<Combinator>),
63 /// Memoize the result of parsing at each position to avoid exponential backtracking
64 Memoize {
65 /// Unique identifier for this memoization point
66 id: usize,
67 /// The inner combinator to memoize
68 inner: Box<Combinator>,
69 },
70}
71
72/// Allow string literals to be used as Combinator::Literal
73impl From<&str> for Combinator {
74 fn from(s: &str) -> Self {
75 Combinator::Literal(s.to_string())
76 }
77}
78
79/// Built-in character classes
80#[derive(Debug, Clone, Copy, PartialEq, Eq)]
81pub enum CharClass {
82 /// Decimal digit: 0-9
83 Digit,
84 /// Hexadecimal digit: 0-9, a-f, A-F
85 HexDigit,
86 /// Alphabetic: a-z, A-Z
87 Alpha,
88 /// Alphanumeric: a-z, A-Z, 0-9
89 AlphaNumeric,
90 /// Whitespace: space, tab, newline, carriage return
91 Whitespace,
92 /// Identifier start: a-z, A-Z, _, $
93 IdentStart,
94 /// Identifier continue: a-z, A-Z, 0-9, _, $
95 IdentCont,
96}
97
98impl CharClass {
99 /// Check if a character matches this class
100 pub fn matches(self, c: char) -> bool {
101 match self {
102 CharClass::Digit => c.is_ascii_digit(),
103 CharClass::HexDigit => c.is_ascii_hexdigit(),
104 CharClass::Alpha => c.is_ascii_alphabetic(),
105 CharClass::AlphaNumeric => c.is_ascii_alphanumeric(),
106 // ECMAScript whitespace: space, tab, vertical tab, form feed, NBSP, BOM, line terminators
107 CharClass::Whitespace => {
108 matches!(
109 c,
110 ' ' | '\t'
111 | '\x0B'
112 | '\x0C'
113 | '\r'
114 | '\n'
115 | '\u{00A0}'
116 | '\u{FEFF}'
117 | '\u{2028}'
118 | '\u{2029}'
119 ) || c.is_whitespace()
120 }
121 CharClass::IdentStart => c.is_ascii_alphabetic() || c == '_' || c == '$',
122 CharClass::IdentCont => c.is_ascii_alphanumeric() || c == '_' || c == '$',
123 }
124 }
125}
126
127/// Pratt parsing definition for expression parsing
128#[derive(Debug, Clone, Default)]
129/// Pratt expression parser definition.
130///
131/// IMPORTANT: The parser generator does NOT handle whitespace automatically.
132/// All whitespace handling must be done explicitly in the grammar DSL.
133/// This includes whitespace between operators and operands.
134///
135/// For expressions with postfix operators followed by infix operators (e.g., "a.x * b"),
136/// the grammar must ensure whitespace is consumed. Common patterns:
137/// 1. Have the operand rule consume surrounding whitespace
138/// 2. Use pattern-based operators that include whitespace in their patterns
139/// 3. Structure the grammar so postfix chains are parsed as complete units
140///
141/// DO NOT add automatic/hardcoded whitespace handling to the parser generator.
142pub struct PrattDef {
143 /// The operand parser (primary expressions)
144 pub operand: Box<Option<Combinator>>,
145 /// Prefix operators
146 pub prefix_ops: Vec<PrefixOp>,
147 /// Infix operators
148 pub infix_ops: Vec<InfixOp>,
149 /// Postfix operators
150 pub postfix_ops: Vec<PostfixOp>,
151 /// Ternary operator (if any)
152 pub ternary: Option<TernaryOp>,
153}
154
155/// Prefix operator definition
156#[derive(Debug, Clone)]
157pub struct PrefixOp {
158 /// The operator pattern (e.g., Literal("!"), Literal("++"))
159 pub pattern: Box<Combinator>,
160 pub precedence: u8,
161 pub mapping: String,
162}
163
164/// Infix operator definition
165#[derive(Debug, Clone)]
166pub struct InfixOp {
167 /// The operator pattern (e.g., Literal("+"), Literal("==="))
168 pub pattern: Box<Combinator>,
169 pub precedence: u8,
170 pub assoc: Assoc,
171 pub mapping: String,
172}
173
174/// Postfix operator definition
175#[derive(Debug, Clone)]
176pub enum PostfixOp {
177 /// Simple postfix (++, --)
178 Simple {
179 /// The operator pattern (e.g., Literal("++"))
180 pattern: Box<Combinator>,
181 precedence: u8,
182 mapping: String,
183 },
184 /// Call expression: callee(args)
185 Call {
186 /// Open delimiter (e.g., Literal("("))
187 open: Box<Combinator>,
188 /// Close delimiter (e.g., Literal(")"))
189 close: Box<Combinator>,
190 /// Argument separator (e.g., Literal(","))
191 separator: Box<Combinator>,
192 /// Optional rule name for parsing arguments (if None, uses ParseOperand)
193 arg_rule: Option<String>,
194 precedence: u8,
195 mapping: String,
196 },
197 /// Index expression: obj[index]
198 Index {
199 /// Open delimiter (e.g., Literal("["))
200 open: Box<Combinator>,
201 /// Close delimiter (e.g., Literal("]"))
202 close: Box<Combinator>,
203 precedence: u8,
204 mapping: String,
205 },
206 /// Member access: obj.prop
207 Member {
208 /// The dot/accessor pattern (e.g., Literal("."), Literal("?."))
209 pattern: Box<Combinator>,
210 precedence: u8,
211 mapping: String,
212 },
213 /// Rule-based postfix: parses another rule as the suffix
214 /// Used for tagged template literals: tag`template`
215 Rule {
216 /// The name of the rule to parse
217 rule_name: String,
218 precedence: u8,
219 mapping: String,
220 },
221}
222
223/// Ternary operator definition
224#[derive(Debug, Clone)]
225pub struct TernaryOp {
226 /// First operator (e.g., Literal("?"))
227 pub first: Box<Combinator>,
228 /// Second operator (e.g., Literal(":"))
229 pub second: Box<Combinator>,
230 pub precedence: u8,
231 pub mapping: String,
232}
233
234/// Configuration for AST integration
235#[derive(Debug, Clone)]
236pub struct AstConfig {
237 /// External modules to import (e.g., "crate::ast::*")
238 pub imports: Vec<String>,
239 /// Return type of the parse() function
240 pub result_type: Option<String>,
241 /// External span type to use instead of generated Span
242 pub span_type: Option<String>,
243 /// External string type to use instead of String (e.g., "JsString")
244 pub string_type: Option<String>,
245 /// External error type to use instead of generated ParseError
246 pub error_type: Option<String>,
247 /// Whether to generate the internal ParseResult enum (default: true)
248 pub generate_parse_result: bool,
249 /// Whether to generate the internal Span struct (default: true)
250 pub generate_span: bool,
251 /// Whether to generate the internal ParseError struct (default: true)
252 pub generate_parse_error: bool,
253 /// Whether to apply AST mappings during parsing (default: false)
254 pub apply_mappings: bool,
255 /// String dictionary type for string interning (e.g., "StringDict")
256 pub string_dict_type: Option<String>,
257 /// Method to call on string dict to intern a string (default: "get_or_insert")
258 pub string_dict_method: Option<String>,
259 /// Helper functions to include in generated code
260 pub helper_code: Vec<String>,
261 /// Custom ParseResult variants for typed AST nodes
262 pub result_variants: Vec<ResultVariant>,
263}
264
265/// A custom ParseResult variant for typed AST nodes
266#[derive(Debug, Clone)]
267pub struct ResultVariant {
268 /// Name of the variant (e.g., "Expr")
269 pub name: String,
270 /// Rust type it holds (e.g., "Expression")
271 pub rust_type: String,
272 /// Expression to get span, where _ is the value (e.g., "_.span")
273 pub span_expr: Option<String>,
274}
275
276impl Default for AstConfig {
277 fn default() -> Self {
278 Self {
279 imports: Vec::new(),
280 result_type: None,
281 span_type: None,
282 string_type: None,
283 error_type: None,
284 generate_parse_result: true,
285 generate_span: true,
286 generate_parse_error: true,
287 apply_mappings: false,
288 string_dict_type: None,
289 string_dict_method: None,
290 helper_code: Vec::new(),
291 result_variants: Vec::new(),
292 }
293 }
294}
295
296impl AstConfig {
297 pub fn new() -> Self {
298 Self::default()
299 }
300}
301
302// ============================================================================
303// Indexed IR types for state-indexed code generation
304// ============================================================================
305
306/// ID types for indexing combinators in generated code.
307/// Using u16 to keep Work enum variants compact while supporting up to 65535 combinators.
308pub type RuleId = u16;
309pub type SeqId = u16;
310pub type ChoiceId = u16;
311pub type LoopId = u16;
312pub type OptId = u16;
313pub type CapId = u16;
314pub type LookId = u16;
315pub type SkipId = u16;
316pub type SepById = u16;
317pub type PrattId = u16;
318pub type MapId = u16;
319pub type MemoId = u16;
320pub type LitId = u16;
321
322/// Reference to a combinator by its type and index.
323/// Used in generated static tables to reference child combinators.
324#[derive(Debug, Clone, Copy, PartialEq, Eq)]
325pub enum CombRef {
326 /// Reference to a rule
327 Rule(RuleId),
328 /// Reference to a sequence
329 Seq(SeqId),
330 /// Reference to a choice
331 Choice(ChoiceId),
332 /// Reference to zero-or-more loop
333 ZeroOrMore(LoopId),
334 /// Reference to one-or-more loop
335 OneOrMore(LoopId),
336 /// Reference to optional
337 Optional(OptId),
338 /// Reference to a literal
339 Literal(LitId),
340 /// Match a character class
341 CharClass(CharClass),
342 /// Match a character range
343 CharRange(char, char),
344 /// Match a single specific character
345 Char(char),
346 /// Match any character
347 AnyChar,
348 /// Reference to a capture
349 Capture(CapId),
350 /// Reference to not-followed-by lookahead
351 NotFollowedBy(LookId),
352 /// Reference to followed-by lookahead
353 FollowedBy(LookId),
354 /// Reference to a skip
355 Skip(SkipId),
356 /// Reference to separated-by
357 SeparatedBy(SepById),
358 /// Reference to Pratt expression parser
359 Pratt(PrattId),
360 /// Reference to a mapped combinator
361 Mapped(MapId),
362 /// Reference to a memoized combinator
363 Memoize(MemoId),
364}
365
366/// Compiled sequence definition for generated code
367#[derive(Debug, Clone)]
368pub struct CompiledSeqDef {
369 /// References to child combinators
370 pub items: Vec<CombRef>,
371}
372
373/// Compiled choice definition for generated code
374#[derive(Debug, Clone)]
375pub struct CompiledChoiceDef {
376 /// References to alternative combinators
377 pub alts: Vec<CombRef>,
378}
379
380/// Compiled loop definition for generated code (used by ZeroOrMore and OneOrMore)
381#[derive(Debug, Clone)]
382pub struct CompiledLoopDef {
383 /// Reference to the item combinator
384 pub item: CombRef,
385}
386
387/// Compiled optional definition for generated code
388#[derive(Debug, Clone)]
389pub struct CompiledOptDef {
390 /// Reference to the inner combinator
391 pub inner: CombRef,
392}
393
394/// Compiled capture definition for generated code
395#[derive(Debug, Clone)]
396pub struct CompiledCapDef {
397 /// Reference to the inner combinator
398 pub inner: CombRef,
399}
400
401/// Compiled lookahead definition (for NotFollowedBy and FollowedBy)
402#[derive(Debug, Clone)]
403pub struct CompiledLookDef {
404 /// Reference to the inner combinator
405 pub inner: CombRef,
406}
407
408/// Compiled skip definition
409#[derive(Debug, Clone)]
410pub struct CompiledSkipDef {
411 /// Reference to the inner combinator
412 pub inner: CombRef,
413}
414
415/// Compiled separated-by definition
416#[derive(Debug, Clone)]
417pub struct CompiledSepByDef {
418 /// Reference to the item combinator
419 pub item: CombRef,
420 /// Reference to the separator combinator
421 pub separator: CombRef,
422 /// Whether trailing separator is allowed
423 pub trailing: bool,
424}
425
426/// Compiled mapped combinator definition
427#[derive(Debug, Clone)]
428pub struct CompiledMapDef {
429 /// Reference to the inner combinator
430 pub inner: CombRef,
431 /// Index into the mapping function array
432 pub mapping_idx: usize,
433}
434
435/// Compiled memoize definition
436#[derive(Debug, Clone)]
437pub struct CompiledMemoDef {
438 /// The memoization ID (same as Combinator::Memoize::id)
439 pub memo_id: usize,
440 /// Reference to the inner combinator
441 pub inner: CombRef,
442}
443
444/// Compiled rule definition
445#[derive(Debug, Clone)]
446pub struct CompiledRuleDef {
447 /// Rule name
448 pub name: String,
449 /// Reference to the rule's top-level combinator
450 pub entry: CombRef,
451}
452
453/// Information extracted from an operator pattern
454#[derive(Debug, Clone)]
455pub struct PatternInfo {
456 /// The literal operator string (e.g., "+", "===")
457 pub literal: String,
458 /// Whether this is a keyword (needs word boundary check)
459 pub is_keyword: bool,
460 /// Strings that must NOT follow the operator (e.g., "=" for "=" to not match "==")
461 pub not_followed_by: Vec<String>,
462 /// Optional leading rule to parse (e.g., whitespace)
463 pub leading_rule: Option<String>,
464}
465
466/// Compiled prefix operator
467#[derive(Debug, Clone)]
468pub struct CompiledPrefixOp {
469 /// Pattern information
470 pub pattern: PatternInfo,
471 /// Precedence level
472 pub precedence: u8,
473 /// Index into prefix mapping function array
474 pub mapping_idx: usize,
475}
476
477/// Compiled infix operator
478#[derive(Debug, Clone)]
479pub struct CompiledInfixOp {
480 /// Pattern information
481 pub pattern: PatternInfo,
482 /// Precedence level
483 pub precedence: u8,
484 /// Associativity
485 pub assoc: Assoc,
486 /// Index into infix mapping function array
487 pub mapping_idx: usize,
488}
489
490/// Compiled postfix operator
491#[derive(Debug, Clone)]
492pub enum CompiledPostfixOp {
493 /// Simple postfix (++, --)
494 Simple {
495 pattern: PatternInfo,
496 precedence: u8,
497 mapping_idx: usize,
498 },
499 /// Call expression: callee(args)
500 Call {
501 open_lit: String,
502 close_lit: String,
503 sep_lit: String,
504 /// Optional rule to parse for arguments
505 arg_rule: Option<RuleId>,
506 precedence: u8,
507 mapping_idx: usize,
508 },
509 /// Index expression: obj[index]
510 Index {
511 open_lit: String,
512 close_lit: String,
513 precedence: u8,
514 mapping_idx: usize,
515 },
516 /// Member access: obj.prop
517 Member {
518 pattern: PatternInfo,
519 precedence: u8,
520 mapping_idx: usize,
521 },
522 /// Rule-based postfix
523 Rule {
524 rule_id: RuleId,
525 precedence: u8,
526 mapping_idx: usize,
527 },
528}
529
530/// Compiled ternary operator
531#[derive(Debug, Clone)]
532pub struct CompiledTernaryOp {
533 /// First operator literal (e.g., "?")
534 pub first_lit: String,
535 /// Second operator literal (e.g., ":")
536 pub second_lit: String,
537 /// Precedence level
538 pub precedence: u8,
539 /// Index into ternary mapping function array
540 pub mapping_idx: usize,
541}
542
543/// Compiled Pratt expression parser definition
544#[derive(Debug, Clone)]
545pub struct CompiledPrattDef {
546 /// Reference to the operand combinator
547 pub operand: Option<CombRef>,
548 /// Compiled prefix operators
549 pub prefix_ops: Vec<CompiledPrefixOp>,
550 /// Compiled infix operators
551 pub infix_ops: Vec<CompiledInfixOp>,
552 /// Compiled postfix operators
553 pub postfix_ops: Vec<CompiledPostfixOp>,
554 /// Compiled ternary operator (if any)
555 pub ternary: Option<CompiledTernaryOp>,
556 /// Whether any infix operator has a leading rule
557 pub has_infix_with_leading: bool,
558 /// Whether any prefix operator has a leading rule
559 pub has_prefix_with_leading: bool,
560}
561
562/// Compact position info for Pratt parsing work items
563#[derive(Debug, Clone, Copy, Default)]
564pub struct PosInfo {
565 pub start_pos: usize,
566 pub start_line: u32,
567 pub start_column: u32,
568}
569
570/// Postfix variant tag for work items
571#[derive(Debug, Clone, Copy, PartialEq, Eq)]
572pub enum PostfixVariant {
573 Simple,
574 Call,
575 Index,
576 Member,
577 Rule,
578}
579
580/// Ternary stage for work items
581#[derive(Debug, Clone, Copy, PartialEq, Eq)]
582pub enum TernaryStage {
583 First,
584 Second,
585}
586
587/// Leading rule context for Pratt work items
588#[derive(Debug, Clone, Copy)]
589pub enum LeadingRuleContext {
590 Prefix,
591 Infix {
592 op_idx: u8,
593 next_prec: u8,
594 checkpoint: usize,
595 },
596}
597
598/// Index of all combinators in the grammar
599#[derive(Debug, Clone, Default)]
600pub struct CombinatorIndex {
601 /// Compiled rules
602 pub rules: Vec<CompiledRuleDef>,
603 /// Map from rule name to rule ID
604 pub rule_map: std::collections::HashMap<String, RuleId>,
605 /// Compiled sequences
606 pub sequences: Vec<CompiledSeqDef>,
607 /// Compiled choices
608 pub choices: Vec<CompiledChoiceDef>,
609 /// Compiled zero-or-more loops
610 pub zero_or_more: Vec<CompiledLoopDef>,
611 /// Compiled one-or-more loops
612 pub one_or_more: Vec<CompiledLoopDef>,
613 /// Compiled optionals
614 pub optionals: Vec<CompiledOptDef>,
615 /// Compiled captures
616 pub captures: Vec<CompiledCapDef>,
617 /// Compiled not-followed-by lookaheads
618 pub not_followed_by: Vec<CompiledLookDef>,
619 /// Compiled followed-by lookaheads
620 pub followed_by: Vec<CompiledLookDef>,
621 /// Compiled skips
622 pub skips: Vec<CompiledSkipDef>,
623 /// Compiled separated-by
624 pub separated_by: Vec<CompiledSepByDef>,
625 /// Compiled Pratt parsers
626 pub pratts: Vec<CompiledPrattDef>,
627 /// Compiled mapped combinators
628 pub mapped: Vec<CompiledMapDef>,
629 /// Compiled memoized combinators
630 pub memoized: Vec<CompiledMemoDef>,
631 /// Unique literals
632 pub literals: Vec<String>,
633 /// Map from literal string to literal ID
634 pub literal_map: std::collections::HashMap<String, LitId>,
635 /// Mapping function strings (code snippets)
636 pub mappings: Vec<String>,
637}