Skip to main content

trampoline_parser/
ir.rs

1//! Intermediate Representation for the grammar
2//!
3//! These types represent the grammar after DSL processing but before code generation.
4//! This is a scannerless (lexerless) parser - no separate tokenization phase.
5
6use crate::Assoc;
7
8/// A parser rule definition
9#[derive(Debug, Clone)]
10pub struct RuleDef {
11    pub name: String,
12    pub combinator: Combinator,
13}
14
15/// Parser combinators for scannerless parsing
16#[derive(Debug, Clone)]
17pub enum Combinator {
18    /// Reference another rule by name
19    Rule(String),
20    /// Sequence of combinators
21    Sequence(Vec<Combinator>),
22    /// Ordered choice (first match wins, auto-backtrack)
23    Choice(Vec<Combinator>),
24    /// Zero or more
25    ZeroOrMore(Box<Combinator>),
26    /// One or more
27    OneOrMore(Box<Combinator>),
28    /// Optional (zero or one)
29    Optional(Box<Combinator>),
30    /// Parse but discard result
31    Skip(Box<Combinator>),
32    /// Separated list: item (sep item)*
33    SeparatedBy {
34        item: Box<Combinator>,
35        separator: Box<Combinator>,
36        trailing: bool,
37    },
38    /// Pratt expression parsing
39    Pratt(PrattDef),
40    /// AST mapping applied to inner combinator
41    Mapped {
42        inner: Box<Combinator>,
43        mapping: String,
44    },
45
46    // === Character-level primitives ===
47    /// Match a literal string exactly (e.g., "if", "===", "+")
48    Literal(String),
49    /// Match a single character
50    Char(char),
51    /// Match a character class (digit, alpha, etc.)
52    CharClass(CharClass),
53    /// Match a character range (e.g., 'a'..='z')
54    CharRange(char, char),
55    /// Match any single character
56    AnyChar,
57    /// Negative lookahead (match if inner does NOT match, consume nothing)
58    NotFollowedBy(Box<Combinator>),
59    /// Positive lookahead (match if inner matches, consume nothing)
60    FollowedBy(Box<Combinator>),
61    /// Capture the matched text as a string
62    Capture(Box<Combinator>),
63    /// Memoize the result of parsing at each position to avoid exponential backtracking
64    Memoize {
65        /// Unique identifier for this memoization point
66        id: usize,
67        /// The inner combinator to memoize
68        inner: Box<Combinator>,
69    },
70}
71
72/// Allow string literals to be used as Combinator::Literal
73impl From<&str> for Combinator {
74    fn from(s: &str) -> Self {
75        Combinator::Literal(s.to_string())
76    }
77}
78
79/// Built-in character classes
80#[derive(Debug, Clone, Copy, PartialEq, Eq)]
81pub enum CharClass {
82    /// Decimal digit: 0-9
83    Digit,
84    /// Hexadecimal digit: 0-9, a-f, A-F
85    HexDigit,
86    /// Alphabetic: a-z, A-Z
87    Alpha,
88    /// Alphanumeric: a-z, A-Z, 0-9
89    AlphaNumeric,
90    /// Whitespace: space, tab, newline, carriage return
91    Whitespace,
92    /// Identifier start: a-z, A-Z, _, $
93    IdentStart,
94    /// Identifier continue: a-z, A-Z, 0-9, _, $
95    IdentCont,
96}
97
98impl CharClass {
99    /// Check if a character matches this class
100    pub fn matches(self, c: char) -> bool {
101        match self {
102            CharClass::Digit => c.is_ascii_digit(),
103            CharClass::HexDigit => c.is_ascii_hexdigit(),
104            CharClass::Alpha => c.is_ascii_alphabetic(),
105            CharClass::AlphaNumeric => c.is_ascii_alphanumeric(),
106            // ECMAScript whitespace: space, tab, vertical tab, form feed, NBSP, BOM, line terminators
107            CharClass::Whitespace => {
108                matches!(
109                    c,
110                    ' ' | '\t'
111                        | '\x0B'
112                        | '\x0C'
113                        | '\r'
114                        | '\n'
115                        | '\u{00A0}'
116                        | '\u{FEFF}'
117                        | '\u{2028}'
118                        | '\u{2029}'
119                ) || c.is_whitespace()
120            }
121            CharClass::IdentStart => c.is_ascii_alphabetic() || c == '_' || c == '$',
122            CharClass::IdentCont => c.is_ascii_alphanumeric() || c == '_' || c == '$',
123        }
124    }
125}
126
127/// Pratt parsing definition for expression parsing
128#[derive(Debug, Clone, Default)]
129/// Pratt expression parser definition.
130///
131/// IMPORTANT: The parser generator does NOT handle whitespace automatically.
132/// All whitespace handling must be done explicitly in the grammar DSL.
133/// This includes whitespace between operators and operands.
134///
135/// For expressions with postfix operators followed by infix operators (e.g., "a.x * b"),
136/// the grammar must ensure whitespace is consumed. Common patterns:
137/// 1. Have the operand rule consume surrounding whitespace
138/// 2. Use pattern-based operators that include whitespace in their patterns
139/// 3. Structure the grammar so postfix chains are parsed as complete units
140///
141/// DO NOT add automatic/hardcoded whitespace handling to the parser generator.
142pub struct PrattDef {
143    /// The operand parser (primary expressions)
144    pub operand: Box<Option<Combinator>>,
145    /// Prefix operators
146    pub prefix_ops: Vec<PrefixOp>,
147    /// Infix operators
148    pub infix_ops: Vec<InfixOp>,
149    /// Postfix operators
150    pub postfix_ops: Vec<PostfixOp>,
151    /// Ternary operator (if any)
152    pub ternary: Option<TernaryOp>,
153}
154
155/// Prefix operator definition
156#[derive(Debug, Clone)]
157pub struct PrefixOp {
158    /// The operator pattern (e.g., Literal("!"), Literal("++"))
159    pub pattern: Box<Combinator>,
160    pub precedence: u8,
161    pub mapping: String,
162}
163
164/// Infix operator definition
165#[derive(Debug, Clone)]
166pub struct InfixOp {
167    /// The operator pattern (e.g., Literal("+"), Literal("==="))
168    pub pattern: Box<Combinator>,
169    pub precedence: u8,
170    pub assoc: Assoc,
171    pub mapping: String,
172}
173
174/// Postfix operator definition
175#[derive(Debug, Clone)]
176pub enum PostfixOp {
177    /// Simple postfix (++, --)
178    Simple {
179        /// The operator pattern (e.g., Literal("++"))
180        pattern: Box<Combinator>,
181        precedence: u8,
182        mapping: String,
183    },
184    /// Call expression: callee(args)
185    Call {
186        /// Open delimiter (e.g., Literal("("))
187        open: Box<Combinator>,
188        /// Close delimiter (e.g., Literal(")"))
189        close: Box<Combinator>,
190        /// Argument separator (e.g., Literal(","))
191        separator: Box<Combinator>,
192        /// Optional rule name for parsing arguments (if None, uses ParseOperand)
193        arg_rule: Option<String>,
194        precedence: u8,
195        mapping: String,
196    },
197    /// Index expression: obj[index]
198    Index {
199        /// Open delimiter (e.g., Literal("["))
200        open: Box<Combinator>,
201        /// Close delimiter (e.g., Literal("]"))
202        close: Box<Combinator>,
203        precedence: u8,
204        mapping: String,
205    },
206    /// Member access: obj.prop
207    Member {
208        /// The dot/accessor pattern (e.g., Literal("."), Literal("?."))
209        pattern: Box<Combinator>,
210        precedence: u8,
211        mapping: String,
212    },
213    /// Rule-based postfix: parses another rule as the suffix
214    /// Used for tagged template literals: tag`template`
215    Rule {
216        /// The name of the rule to parse
217        rule_name: String,
218        precedence: u8,
219        mapping: String,
220    },
221}
222
223/// Ternary operator definition
224#[derive(Debug, Clone)]
225pub struct TernaryOp {
226    /// First operator (e.g., Literal("?"))
227    pub first: Box<Combinator>,
228    /// Second operator (e.g., Literal(":"))
229    pub second: Box<Combinator>,
230    pub precedence: u8,
231    pub mapping: String,
232}
233
234/// Configuration for AST integration
235#[derive(Debug, Clone)]
236pub struct AstConfig {
237    /// External modules to import (e.g., "crate::ast::*")
238    pub imports: Vec<String>,
239    /// Return type of the parse() function
240    pub result_type: Option<String>,
241    /// External span type to use instead of generated Span
242    pub span_type: Option<String>,
243    /// External string type to use instead of String (e.g., "JsString")
244    pub string_type: Option<String>,
245    /// External error type to use instead of generated ParseError
246    pub error_type: Option<String>,
247    /// Whether to generate the internal ParseResult enum (default: true)
248    pub generate_parse_result: bool,
249    /// Whether to generate the internal Span struct (default: true)
250    pub generate_span: bool,
251    /// Whether to generate the internal ParseError struct (default: true)
252    pub generate_parse_error: bool,
253    /// Whether to apply AST mappings during parsing (default: false)
254    pub apply_mappings: bool,
255    /// String dictionary type for string interning (e.g., "StringDict")
256    pub string_dict_type: Option<String>,
257    /// Method to call on string dict to intern a string (default: "get_or_insert")
258    pub string_dict_method: Option<String>,
259    /// Helper functions to include in generated code
260    pub helper_code: Vec<String>,
261    /// Custom ParseResult variants for typed AST nodes
262    pub result_variants: Vec<ResultVariant>,
263}
264
265/// A custom ParseResult variant for typed AST nodes
266#[derive(Debug, Clone)]
267pub struct ResultVariant {
268    /// Name of the variant (e.g., "Expr")
269    pub name: String,
270    /// Rust type it holds (e.g., "Expression")
271    pub rust_type: String,
272    /// Expression to get span, where _ is the value (e.g., "_.span")
273    pub span_expr: Option<String>,
274}
275
276impl Default for AstConfig {
277    fn default() -> Self {
278        Self {
279            imports: Vec::new(),
280            result_type: None,
281            span_type: None,
282            string_type: None,
283            error_type: None,
284            generate_parse_result: true,
285            generate_span: true,
286            generate_parse_error: true,
287            apply_mappings: false,
288            string_dict_type: None,
289            string_dict_method: None,
290            helper_code: Vec::new(),
291            result_variants: Vec::new(),
292        }
293    }
294}
295
296impl AstConfig {
297    pub fn new() -> Self {
298        Self::default()
299    }
300}
301
302// ============================================================================
303// Indexed IR types for state-indexed code generation
304// ============================================================================
305
306/// ID types for indexing combinators in generated code.
307/// Using u16 to keep Work enum variants compact while supporting up to 65535 combinators.
308pub type RuleId = u16;
309pub type SeqId = u16;
310pub type ChoiceId = u16;
311pub type LoopId = u16;
312pub type OptId = u16;
313pub type CapId = u16;
314pub type LookId = u16;
315pub type SkipId = u16;
316pub type SepById = u16;
317pub type PrattId = u16;
318pub type MapId = u16;
319pub type MemoId = u16;
320pub type LitId = u16;
321
322/// Reference to a combinator by its type and index.
323/// Used in generated static tables to reference child combinators.
324#[derive(Debug, Clone, Copy, PartialEq, Eq)]
325pub enum CombRef {
326    /// Reference to a rule
327    Rule(RuleId),
328    /// Reference to a sequence
329    Seq(SeqId),
330    /// Reference to a choice
331    Choice(ChoiceId),
332    /// Reference to zero-or-more loop
333    ZeroOrMore(LoopId),
334    /// Reference to one-or-more loop
335    OneOrMore(LoopId),
336    /// Reference to optional
337    Optional(OptId),
338    /// Reference to a literal
339    Literal(LitId),
340    /// Match a character class
341    CharClass(CharClass),
342    /// Match a character range
343    CharRange(char, char),
344    /// Match a single specific character
345    Char(char),
346    /// Match any character
347    AnyChar,
348    /// Reference to a capture
349    Capture(CapId),
350    /// Reference to not-followed-by lookahead
351    NotFollowedBy(LookId),
352    /// Reference to followed-by lookahead
353    FollowedBy(LookId),
354    /// Reference to a skip
355    Skip(SkipId),
356    /// Reference to separated-by
357    SeparatedBy(SepById),
358    /// Reference to Pratt expression parser
359    Pratt(PrattId),
360    /// Reference to a mapped combinator
361    Mapped(MapId),
362    /// Reference to a memoized combinator
363    Memoize(MemoId),
364}
365
366/// Compiled sequence definition for generated code
367#[derive(Debug, Clone)]
368pub struct CompiledSeqDef {
369    /// References to child combinators
370    pub items: Vec<CombRef>,
371}
372
373/// Compiled choice definition for generated code
374#[derive(Debug, Clone)]
375pub struct CompiledChoiceDef {
376    /// References to alternative combinators
377    pub alts: Vec<CombRef>,
378}
379
380/// Compiled loop definition for generated code (used by ZeroOrMore and OneOrMore)
381#[derive(Debug, Clone)]
382pub struct CompiledLoopDef {
383    /// Reference to the item combinator
384    pub item: CombRef,
385}
386
387/// Compiled optional definition for generated code
388#[derive(Debug, Clone)]
389pub struct CompiledOptDef {
390    /// Reference to the inner combinator
391    pub inner: CombRef,
392}
393
394/// Compiled capture definition for generated code
395#[derive(Debug, Clone)]
396pub struct CompiledCapDef {
397    /// Reference to the inner combinator
398    pub inner: CombRef,
399}
400
401/// Compiled lookahead definition (for NotFollowedBy and FollowedBy)
402#[derive(Debug, Clone)]
403pub struct CompiledLookDef {
404    /// Reference to the inner combinator
405    pub inner: CombRef,
406}
407
408/// Compiled skip definition
409#[derive(Debug, Clone)]
410pub struct CompiledSkipDef {
411    /// Reference to the inner combinator
412    pub inner: CombRef,
413}
414
415/// Compiled separated-by definition
416#[derive(Debug, Clone)]
417pub struct CompiledSepByDef {
418    /// Reference to the item combinator
419    pub item: CombRef,
420    /// Reference to the separator combinator
421    pub separator: CombRef,
422    /// Whether trailing separator is allowed
423    pub trailing: bool,
424}
425
426/// Compiled mapped combinator definition
427#[derive(Debug, Clone)]
428pub struct CompiledMapDef {
429    /// Reference to the inner combinator
430    pub inner: CombRef,
431    /// Index into the mapping function array
432    pub mapping_idx: usize,
433}
434
435/// Compiled memoize definition
436#[derive(Debug, Clone)]
437pub struct CompiledMemoDef {
438    /// The memoization ID (same as Combinator::Memoize::id)
439    pub memo_id: usize,
440    /// Reference to the inner combinator
441    pub inner: CombRef,
442}
443
444/// Compiled rule definition
445#[derive(Debug, Clone)]
446pub struct CompiledRuleDef {
447    /// Rule name
448    pub name: String,
449    /// Reference to the rule's top-level combinator
450    pub entry: CombRef,
451}
452
453/// Information extracted from an operator pattern
454#[derive(Debug, Clone)]
455pub struct PatternInfo {
456    /// The literal operator string (e.g., "+", "===")
457    pub literal: String,
458    /// Whether this is a keyword (needs word boundary check)
459    pub is_keyword: bool,
460    /// Strings that must NOT follow the operator (e.g., "=" for "=" to not match "==")
461    pub not_followed_by: Vec<String>,
462    /// Optional leading rule to parse (e.g., whitespace)
463    pub leading_rule: Option<String>,
464}
465
466/// Compiled prefix operator
467#[derive(Debug, Clone)]
468pub struct CompiledPrefixOp {
469    /// Pattern information
470    pub pattern: PatternInfo,
471    /// Precedence level
472    pub precedence: u8,
473    /// Index into prefix mapping function array
474    pub mapping_idx: usize,
475}
476
477/// Compiled infix operator
478#[derive(Debug, Clone)]
479pub struct CompiledInfixOp {
480    /// Pattern information
481    pub pattern: PatternInfo,
482    /// Precedence level
483    pub precedence: u8,
484    /// Associativity
485    pub assoc: Assoc,
486    /// Index into infix mapping function array
487    pub mapping_idx: usize,
488}
489
490/// Compiled postfix operator
491#[derive(Debug, Clone)]
492pub enum CompiledPostfixOp {
493    /// Simple postfix (++, --)
494    Simple {
495        pattern: PatternInfo,
496        precedence: u8,
497        mapping_idx: usize,
498    },
499    /// Call expression: callee(args)
500    Call {
501        open_lit: String,
502        close_lit: String,
503        sep_lit: String,
504        /// Optional rule to parse for arguments
505        arg_rule: Option<RuleId>,
506        precedence: u8,
507        mapping_idx: usize,
508    },
509    /// Index expression: obj[index]
510    Index {
511        open_lit: String,
512        close_lit: String,
513        precedence: u8,
514        mapping_idx: usize,
515    },
516    /// Member access: obj.prop
517    Member {
518        pattern: PatternInfo,
519        precedence: u8,
520        mapping_idx: usize,
521    },
522    /// Rule-based postfix
523    Rule {
524        rule_id: RuleId,
525        precedence: u8,
526        mapping_idx: usize,
527    },
528}
529
530/// Compiled ternary operator
531#[derive(Debug, Clone)]
532pub struct CompiledTernaryOp {
533    /// First operator literal (e.g., "?")
534    pub first_lit: String,
535    /// Second operator literal (e.g., ":")
536    pub second_lit: String,
537    /// Precedence level
538    pub precedence: u8,
539    /// Index into ternary mapping function array
540    pub mapping_idx: usize,
541}
542
543/// Compiled Pratt expression parser definition
544#[derive(Debug, Clone)]
545pub struct CompiledPrattDef {
546    /// Reference to the operand combinator
547    pub operand: Option<CombRef>,
548    /// Compiled prefix operators
549    pub prefix_ops: Vec<CompiledPrefixOp>,
550    /// Compiled infix operators
551    pub infix_ops: Vec<CompiledInfixOp>,
552    /// Compiled postfix operators
553    pub postfix_ops: Vec<CompiledPostfixOp>,
554    /// Compiled ternary operator (if any)
555    pub ternary: Option<CompiledTernaryOp>,
556    /// Whether any infix operator has a leading rule
557    pub has_infix_with_leading: bool,
558    /// Whether any prefix operator has a leading rule
559    pub has_prefix_with_leading: bool,
560}
561
562/// Compact position info for Pratt parsing work items
563#[derive(Debug, Clone, Copy, Default)]
564pub struct PosInfo {
565    pub start_pos: usize,
566    pub start_line: u32,
567    pub start_column: u32,
568}
569
570/// Postfix variant tag for work items
571#[derive(Debug, Clone, Copy, PartialEq, Eq)]
572pub enum PostfixVariant {
573    Simple,
574    Call,
575    Index,
576    Member,
577    Rule,
578}
579
580/// Ternary stage for work items
581#[derive(Debug, Clone, Copy, PartialEq, Eq)]
582pub enum TernaryStage {
583    First,
584    Second,
585}
586
587/// Leading rule context for Pratt work items
588#[derive(Debug, Clone, Copy)]
589pub enum LeadingRuleContext {
590    Prefix,
591    Infix {
592        op_idx: u8,
593        next_prec: u8,
594        checkpoint: usize,
595    },
596}
597
598/// Index of all combinators in the grammar
599#[derive(Debug, Clone, Default)]
600pub struct CombinatorIndex {
601    /// Compiled rules
602    pub rules: Vec<CompiledRuleDef>,
603    /// Map from rule name to rule ID
604    pub rule_map: std::collections::HashMap<String, RuleId>,
605    /// Compiled sequences
606    pub sequences: Vec<CompiledSeqDef>,
607    /// Compiled choices
608    pub choices: Vec<CompiledChoiceDef>,
609    /// Compiled zero-or-more loops
610    pub zero_or_more: Vec<CompiledLoopDef>,
611    /// Compiled one-or-more loops
612    pub one_or_more: Vec<CompiledLoopDef>,
613    /// Compiled optionals
614    pub optionals: Vec<CompiledOptDef>,
615    /// Compiled captures
616    pub captures: Vec<CompiledCapDef>,
617    /// Compiled not-followed-by lookaheads
618    pub not_followed_by: Vec<CompiledLookDef>,
619    /// Compiled followed-by lookaheads
620    pub followed_by: Vec<CompiledLookDef>,
621    /// Compiled skips
622    pub skips: Vec<CompiledSkipDef>,
623    /// Compiled separated-by
624    pub separated_by: Vec<CompiledSepByDef>,
625    /// Compiled Pratt parsers
626    pub pratts: Vec<CompiledPrattDef>,
627    /// Compiled mapped combinators
628    pub mapped: Vec<CompiledMapDef>,
629    /// Compiled memoized combinators
630    pub memoized: Vec<CompiledMemoDef>,
631    /// Unique literals
632    pub literals: Vec<String>,
633    /// Map from literal string to literal ID
634    pub literal_map: std::collections::HashMap<String, LitId>,
635    /// Mapping function strings (code snippets)
636    pub mappings: Vec<String>,
637}