perl_ast_v2/lib.rs
1//! Enhanced AST with full position tracking for incremental parsing
2//!
3//! This module provides an updated AST that uses Range instead of SourceLocation
4//! to support incremental parsing and better error reporting.
5
6use perl_position_tracking::Range;
7
/// A unique identifier for AST nodes to support incremental parsing.
///
/// This is a plain alias for `usize`, so it is interchangeable with any other
/// `usize` value at the type level. `NodeIdGenerator` hands these out
/// sequentially, starting from zero.
pub type NodeId = usize;
10
/// Index into the diagnostics array in `ParseOutput`.
///
/// This type enables lightweight error nodes that reference diagnostics
/// stored separately from the AST, reducing memory overhead and decoupling
/// tree structure from human-readable messages.
///
/// It is the payload of `NodeKind::ErrorRef`. Because it is an alias for
/// `u32`, it must be converted to `usize` before being used to index a
/// collection (as the `ErrorRef` documentation example does).
pub type DiagnosticId = u32;
17
/// Kinds of missing syntax elements for error recovery.
///
/// Each variant names the construct that was expected but not found, which
/// lets consumers report a specific diagnostic instead of a generic
/// "syntax error".
///
/// The type is a small `Copy` value. It derives `Hash` alongside `Eq`, so it
/// can be used directly as a key in `HashMap`/`HashSet` (for example to
/// group or de-duplicate recovery placeholders by kind).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MissingKind {
    /// Missing expression (e.g., after `=` in assignment)
    Expression,
    /// Missing statement
    Statement,
    /// Missing identifier/name
    Identifier,
    /// Missing block `{ ... }`
    Block,
    /// Missing closing delimiter; the payload is the delimiter character
    /// that was expected.
    ClosingDelimiter(char),
    /// Missing semicolon
    Semicolon,
    /// Missing condition (e.g., in `if`)
    Condition,
    /// Missing argument
    Argument,
    /// Missing operator
    Operator,
}
43
44/// Enhanced AST node with full position tracking
45#[derive(Debug, Clone, PartialEq)]
46pub struct Node {
47 /// Unique identifier for this node
48 pub id: NodeId,
49 /// The kind of syntax node
50 pub kind: NodeKind,
51 /// Source range with line/column information
52 pub range: Range,
53}
54
55impl Node {
56 /// Create a new AST node
57 pub fn new(id: NodeId, kind: NodeKind, range: Range) -> Self {
58 Node { id, kind, range }
59 }
60
61 /// Convert to tree-sitter compatible S-expression
62 pub fn to_sexp(&self) -> String {
63 // Delegate to existing implementation
64 self.kind.to_sexp()
65 }
66}
67
/// The kinds of AST nodes used by the parser.
///
/// Each variant represents a specific syntactic construct in the Perl source
/// and carries the child nodes or data needed to represent that construct.
/// Child nodes are stored as owned [`Node`] values (boxed where a variant
/// holds exactly one child, in a `Vec` where it holds a sequence).
#[derive(Debug, Clone, PartialEq)]
pub enum NodeKind {
    // ---- Program structure ----
    /// Top-level program containing a list of statements.
    Program {
        /// Statements contained by the program/root node.
        statements: Vec<Node>,
    },
    /// Block node containing a list of statements.
    Block {
        /// Statements inside a block.
        statements: Vec<Node>,
    },

    // ---- Declarations ----
    /// A single variable declaration (`my`, `our`, `local`, `state`, ...).
    VariableDeclaration {
        /// The declarator keyword as text (e.g. `my`, `our`, `local`, `state`).
        declarator: String,
        /// The variable node being declared.
        variable: Box<Node>,
        /// Any attributes attached to the declaration, as text.
        attributes: Vec<String>,
        /// Optional initializer expression; `None` when the declaration has
        /// no initializer.
        initializer: Option<Box<Node>>,
    },

    /// A list-style variable declaration (e.g. `my ($a, $b) = ...`).
    VariableListDeclaration {
        /// The declarator keyword as text.
        declarator: String,
        /// Variables declared in the list.
        variables: Vec<Node>,
        /// Any attributes attached to the declaration, as text.
        attributes: Vec<String>,
        /// Optional initializer for the list; `None` when absent.
        initializer: Option<Box<Node>>,
    },

    // ---- Variables ----
    /// A variable usage with sigil and name (e.g. `$foo`, `@arr`).
    Variable {
        /// The sigil, stored as text (e.g. `$`, `@`, `%`, `*`).
        sigil: String,
        /// The identifier/name of the variable.
        name: String,
    },

    // ---- Error recovery nodes ----
    /// An error/recovery node produced during parsing (legacy, rich payload).
    ///
    /// This variant embeds the error information directly in the AST node.
    /// For new code, prefer `ErrorRef` which stores only a diagnostic index.
    Error {
        /// Human readable error message.
        message: String,
        /// Tokens or node kinds that were expected at this location.
        expected: Vec<String>,
        /// Optional partially parsed node for recovery contexts.
        partial: Option<Box<Node>>,
    },

    /// Lightweight error node referencing a diagnostic by index.
    ///
    /// This is the preferred error representation for memory efficiency.
    /// The actual diagnostic information is stored in `ParseOutput.diagnostics`
    /// and can be looked up by the `diag_id`.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let output = parser.parse_with_recovery();
    /// for node in output.ast.walk() {
    ///     if let NodeKind::ErrorRef { diag_id } = &node.kind {
    ///         let diagnostic = &output.diagnostics[*diag_id as usize];
    ///         println!("Error at {:?}: {}", node.range, diagnostic);
    ///     }
    /// }
    /// ```
    ErrorRef {
        /// Index into the diagnostics array in `ParseOutput`.
        diag_id: DiagnosticId,
    },

    /// Placeholder for a missing expression during error recovery.
    MissingExpression,
    /// Placeholder for a missing statement during error recovery.
    MissingStatement,
    /// Placeholder for a missing identifier during error recovery.
    MissingIdentifier,
    /// Placeholder for a missing block during error recovery.
    MissingBlock,

    /// Specific kind of missing syntax element.
    ///
    /// This provides more granular information about what's missing
    /// without embedding full diagnostic details in the AST. Note that
    /// [`MissingKind`] overlaps with the four dedicated `Missing*` unit
    /// variants above (expression, statement, identifier, block).
    Missing(MissingKind),

    // NOTE: the variant list in this file is abbreviated. The remaining
    // variants of the original AST are not reproduced here.

    // ---- Expressions ----
    /// A binary expression (e.g. `a + b`).
    Binary {
        /// The operator token as text.
        op: String,
        /// Left-hand side expression.
        left: Box<Node>,
        /// Right-hand side expression.
        right: Box<Node>,
    },

    /// A unary expression (e.g. `-x`, `!flag`).
    Unary {
        /// The operator token as text.
        op: String,
        /// The operand expression.
        operand: Box<Node>,
    },

    // ---- Control flow ----
    /// An `if` control-flow construct, including `elsif` and `else` branches.
    If {
        /// The conditional expression.
        condition: Box<Node>,
        /// The then-branch block node.
        then_branch: Box<Node>,
        /// Zero or more `elsif` branches represented as (condition, block).
        elsif_branches: Vec<(Node, Node)>,
        /// Optional else branch; `None` when there is no `else`.
        else_branch: Option<Box<Node>>,
    },

    // ---- Literals ----
    /// Numeric literal node.
    Number {
        /// The literal text of the number (kept as text, not parsed).
        value: String,
    },
    /// String literal node; may be interpolated.
    String {
        /// The string contents.
        value: String,
        /// Whether the string contains interpolation.
        interpolated: bool,
    },
    /// An identifier token.
    Identifier {
        /// The identifier text.
        name: String,
    },
    // NOTE: further variants are expected to be added here.
}
226
227impl NodeKind {
228 /// Convert to S-expression format
229 pub fn to_sexp(&self) -> String {
230 use NodeKind::*;
231
232 match self {
233 Program { statements } => {
234 let stmts = statements.iter().map(|s| s.to_sexp()).collect::<Vec<_>>().join(" ");
235 format!("(source_file {})", stmts)
236 }
237
238 Block { statements } => {
239 let stmts = statements.iter().map(|s| s.to_sexp()).collect::<Vec<_>>().join(" ");
240 format!("(block {})", stmts)
241 }
242
243 Variable { sigil, name } => {
244 format!("(variable {} {})", sigil, name)
245 }
246
247 Number { value } => format!("(number {})", value),
248
249 String { value, interpolated } => {
250 if *interpolated {
251 format!("(string_interpolated {:?})", value)
252 } else {
253 format!("(string {:?})", value)
254 }
255 }
256
257 Binary { op, left, right } => {
258 format!("(binary_{} {} {})", op, left.to_sexp(), right.to_sexp())
259 }
260
261 Error { message, .. } => format!("(ERROR {})", message),
262 ErrorRef { diag_id } => format!("(ERROR_REF #{})", diag_id),
263
264 MissingExpression => "(MISSING_EXPRESSION)".to_string(),
265 MissingStatement => "(MISSING_STATEMENT)".to_string(),
266 MissingIdentifier => "(MISSING_IDENTIFIER)".to_string(),
267 MissingBlock => "(MISSING_BLOCK)".to_string(),
268 Missing(kind) => format!("(MISSING {:?})", kind),
269
270 // Add other variants...
271 _ => format!("({:?})", self),
272 }
273 }
274}
275
276/// Generator for producing unique `NodeId` values used across the AST.
277///
278/// This utility ensures each constructed `Node` receives a distinct identifier
279/// which is useful for incremental parsing, diffing and node references.
280pub struct NodeIdGenerator {
281 /// The next identifier to hand out.
282 next_id: NodeId,
283}
284
285impl NodeIdGenerator {
286 /// Create a new `NodeIdGenerator` starting at zero.
287 pub fn new() -> Self {
288 NodeIdGenerator { next_id: 0 }
289 }
290
291 /// Return the next unique `NodeId` and advance the generator.
292 pub fn next_id(&mut self) -> NodeId {
293 let id = self.next_id;
294 self.next_id += 1;
295 id
296 }
297}
298
299impl Default for NodeIdGenerator {
300 fn default() -> Self {
301 Self::new()
302 }
303}
304
#[cfg(test)]
mod tests {
    use super::*;
    use perl_position_tracking::{Position, Range};

    /// The first generated id is zero and a number node renders as
    /// `(number <text>)`.
    #[test]
    fn test_node_creation() {
        let mut ids = NodeIdGenerator::new();
        let start = Position::new(0, 1, 1);
        let end = Position::new(5, 1, 6);
        let span = Range::new(start, end);
        let kind = NodeKind::Number { value: String::from("42") };

        let node = Node::new(ids.next_id(), kind, span);

        assert_eq!(node.id, 0);
        assert_eq!(node.to_sexp(), "(number 42)");
    }

    /// A legacy error node renders only its message; `expected` and
    /// `partial` do not appear in the S-expression.
    #[test]
    fn test_error_nodes() {
        let mut ids = NodeIdGenerator::new();
        let span = Range::new(Position::new(0, 1, 1), Position::new(0, 1, 1));
        let kind = NodeKind::Error {
            message: String::from("Unexpected token"),
            expected: vec![String::from("identifier")],
            partial: None,
        };

        let error = Node::new(ids.next_id(), kind, span);

        assert_eq!(error.to_sexp(), "(ERROR Unexpected token)");
    }
}
338}