Skip to main content

perl_tokenizer/
trivia_parser.rs

//! Trivia-preserving parser implementation
//!
//! This module provides a parser that preserves comments, whitespace, and POD
//! by attaching them to AST nodes as leading/trailing trivia.
5
6use crate::trivia::{NodeWithTrivia, Trivia, TriviaToken};
7use perl_ast_v2::{Node, NodeIdGenerator, NodeKind};
8use perl_lexer::{PerlLexer, Token, TokenType};
9use perl_position_tracking::{Position, Range};
10use std::collections::VecDeque;
11
12/// Token with trivia information
13#[derive(Debug, Clone)]
14pub(crate) struct TokenWithTrivia {
15    /// The actual token
16    token: Token,
17    /// Leading trivia (comments/whitespace before this token)
18    leading_trivia: Vec<TriviaToken>,
19    /// Token range
20    range: Range,
21}
22
23/// Parser context that preserves trivia
24pub struct TriviaParserContext {
25    /// Source text
26    _source: String,
27    /// Tokens with trivia
28    tokens: VecDeque<TokenWithTrivia>,
29    /// Current token index
30    current: usize,
31    /// Node ID generator
32    id_generator: NodeIdGenerator,
33    /// Position tracker for accurate line/column info
34    position_tracker: PositionTracker,
35}
36
37/// Tracks position in source for accurate line/column information
38struct PositionTracker {
39    /// Line start offsets
40    line_starts: Vec<usize>,
41}
42
43impl PositionTracker {
44    fn new(source: &str) -> Self {
45        let mut line_starts = vec![0];
46        for (i, ch) in source.char_indices() {
47            if ch == '\n' {
48                line_starts.push(i + 1);
49            }
50        }
51        PositionTracker { line_starts }
52    }
53
54    fn offset_to_position(&self, offset: usize) -> Position {
55        let line = self.line_starts.binary_search(&offset).unwrap_or_else(|i| i.saturating_sub(1));
56        let line_start = self.line_starts[line];
57        let column = offset - line_start + 1;
58        Position::new(offset, (line + 1) as u32, column as u32)
59    }
60}
61
62impl TriviaParserContext {
63    /// Create a new trivia-preserving parser context
64    pub fn new(source: String) -> Self {
65        let position_tracker = PositionTracker::new(&source);
66        let mut tokens = VecDeque::new();
67
68        // Custom tokenization that preserves trivia
69        let mut position = 0;
70        let _source_bytes = source.as_bytes();
71
72        while position < source.len() {
73            // Collect leading trivia
74            let _trivia_start = position;
75            let leading_trivia = Self::collect_trivia_at(&source, &mut position);
76
77            if position >= source.len() {
78                break;
79            }
80
81            // Get next meaningful token using the lexer
82            let token_source = &source[position..];
83            let mut lexer = PerlLexer::new(token_source);
84
85            if let Some(token) = lexer.next_token() {
86                // Skip EOF tokens
87                if matches!(token.token_type, TokenType::EOF) {
88                    break;
89                }
90
91                // Adjust token positions to be relative to the full source
92                let adjusted_token = Token::new(
93                    token.token_type.clone(),
94                    token.text.clone(),
95                    position + token.start,
96                    position + token.end,
97                );
98
99                // Create range with proper line/column info
100                let start_pos = position_tracker.offset_to_position(adjusted_token.start);
101                let end_pos = position_tracker.offset_to_position(adjusted_token.end);
102                let range = Range::new(start_pos, end_pos);
103
104                tokens.push_back(TokenWithTrivia {
105                    token: adjusted_token.clone(),
106                    leading_trivia,
107                    range,
108                });
109
110                // Advance position
111                position = adjusted_token.end;
112            } else {
113                break;
114            }
115        }
116
117        // Handle remaining trivia at EOF, or source that was entirely trivia
118        if tokens.is_empty() || position < source.len() {
119            let remaining_trivia = if position < source.len() {
120                Self::collect_trivia_at(&source, &mut position)
121            } else {
122                Vec::new()
123            };
124            if !remaining_trivia.is_empty() || tokens.is_empty() {
125                let trivia = if tokens.is_empty() {
126                    // Source was entirely trivia — re-collect from start
127                    let mut pos = 0;
128                    Self::collect_trivia_at(&source, &mut pos)
129                } else {
130                    remaining_trivia
131                };
132                if !trivia.is_empty() {
133                    let eof_pos = position_tracker.offset_to_position(source.len());
134                    let eof_token =
135                        Token::new(TokenType::EOF, String::new(), source.len(), source.len());
136                    tokens.push_back(TokenWithTrivia {
137                        token: eof_token,
138                        leading_trivia: trivia,
139                        range: Range::new(eof_pos, eof_pos),
140                    });
141                }
142            }
143        }
144
145        TriviaParserContext {
146            _source: source,
147            tokens,
148            current: 0,
149            id_generator: NodeIdGenerator::new(),
150            position_tracker,
151        }
152    }
153
154    /// Collect trivia at the given position
155    fn collect_trivia_at(source: &str, position: &mut usize) -> Vec<TriviaToken> {
156        let mut trivia = Vec::new();
157        let bytes = source.as_bytes();
158
159        while *position < source.len() {
160            let _start = *position;
161            let ch = bytes[*position];
162
163            match ch {
164                // Whitespace
165                b' ' | b'\t' | b'\r' => {
166                    let ws_start = *position;
167                    while *position < source.len()
168                        && matches!(bytes[*position], b' ' | b'\t' | b'\r')
169                    {
170                        *position += 1;
171                    }
172
173                    let ws = &source[ws_start..*position];
174                    trivia.push(TriviaToken::new(
175                        Trivia::Whitespace(ws.to_string()),
176                        Range::new(Position::new(ws_start, 0, 0), Position::new(*position, 0, 0)),
177                    ));
178                }
179
180                // Newline
181                b'\n' => {
182                    trivia.push(TriviaToken::new(
183                        Trivia::Newline,
184                        Range::new(
185                            Position::new(*position, 0, 0),
186                            Position::new(*position + 1, 0, 0),
187                        ),
188                    ));
189                    *position += 1;
190                }
191
192                // Comment
193                b'#' => {
194                    let comment_start = *position;
195                    // Find end of line
196                    while *position < source.len() && bytes[*position] != b'\n' {
197                        *position += 1;
198                    }
199
200                    let comment = &source[comment_start..*position];
201                    trivia.push(TriviaToken::new(
202                        Trivia::LineComment(comment.to_string()),
203                        Range::new(
204                            Position::new(comment_start, 0, 0),
205                            Position::new(*position, 0, 0),
206                        ),
207                    ));
208                }
209
210                // POD documentation
211                b'=' if *position == 0 || (*position > 0 && bytes[*position - 1] == b'\n') => {
212                    // Check if this starts a POD section
213                    let remaining = &source[*position..];
214                    if remaining.starts_with("=pod")
215                        || remaining.starts_with("=head")
216                        || remaining.starts_with("=over")
217                        || remaining.starts_with("=item")
218                        || remaining.starts_with("=back")
219                        || remaining.starts_with("=begin")
220                        || remaining.starts_with("=end")
221                        || remaining.starts_with("=for")
222                        || remaining.starts_with("=encoding")
223                    {
224                        let pod_start = *position;
225
226                        // Edge case fix: Find =cut at start of line (including position 0 or after newline)
227                        let mut found_cut = false;
228                        while *position < source.len() {
229                            // Check for =cut at the start of a line
230                            if (*position == 0 || (*position > 0 && bytes[*position - 1] == b'\n'))
231                                && source[*position..].starts_with("=cut")
232                            {
233                                *position += 4; // Skip "=cut"
234                                // Skip to end of line
235                                while *position < source.len() && bytes[*position] != b'\n' {
236                                    *position += 1;
237                                }
238                                if *position < source.len() {
239                                    *position += 1; // Skip newline
240                                }
241                                found_cut = true;
242                                break;
243                            }
244                            *position += 1;
245                        }
246
247                        // Edge case fix: If no =cut found, POD extends to end of file
248                        if !found_cut {
249                            *position = source.len();
250                        }
251
252                        let pod = &source[pod_start..*position];
253                        trivia.push(TriviaToken::new(
254                            Trivia::PodComment(pod.to_string()),
255                            Range::new(
256                                Position::new(pod_start, 0, 0),
257                                Position::new(*position, 0, 0),
258                            ),
259                        ));
260                    } else {
261                        // Not POD, this is a regular token
262                        break;
263                    }
264                }
265
266                // Non-trivia character
267                _ => {
268                    // Check for Unicode whitespace
269                    if ch >= 128 {
270                        let ch_str = &source[*position..];
271                        if let Some(unicode_ch) = ch_str.chars().next() {
272                            if unicode_ch.is_whitespace() {
273                                let ch_len = unicode_ch.len_utf8();
274                                trivia.push(TriviaToken::new(
275                                    Trivia::Whitespace(unicode_ch.to_string()),
276                                    Range::new(
277                                        Position::new(*position, 0, 0),
278                                        Position::new(*position + ch_len, 0, 0),
279                                    ),
280                                ));
281                                *position += ch_len;
282                                continue;
283                            }
284                        }
285                    }
286
287                    // Not trivia, stop collecting
288                    break;
289                }
290            }
291        }
292
293        trivia
294    }
295
296    /// Get current token with trivia
297    pub(crate) fn current_token(&self) -> Option<&TokenWithTrivia> {
298        self.tokens.get(self.current)
299    }
300
301    /// Advance to next token
302    pub(crate) fn advance(&mut self) -> Option<&TokenWithTrivia> {
303        if self.current < self.tokens.len() {
304            self.current += 1;
305        }
306        self.current_token()
307    }
308
309    /// Check if at end of tokens
310    pub fn is_eof(&self) -> bool {
311        self.current >= self.tokens.len()
312    }
313}
314
315/// Parser that preserves trivia
316pub struct TriviaPreservingParser {
317    context: TriviaParserContext,
318}
319
320impl TriviaPreservingParser {
321    /// Create a new trivia-preserving parser
322    pub fn new(source: String) -> Self {
323        TriviaPreservingParser { context: TriviaParserContext::new(source) }
324    }
325
326    /// Parse the source, preserving trivia
327    pub fn parse(mut self) -> NodeWithTrivia {
328        let start_pos = Position::new(0, 1, 1);
329        let mut statement_nodes = Vec::new();
330
331        // Collect all trivia from all tokens (including EOF) so that
332        // blank lines between statements and inline POD are surfaced.
333        let mut leading_trivia = Vec::new();
334        for token in &self.context.tokens {
335            leading_trivia.extend(token.leading_trivia.iter().cloned());
336        }
337
338        // Parse statements
339        while !self.context.is_eof() {
340            if let Some(stmt) = self.parse_statement() {
341                statement_nodes.push(stmt.node);
342            }
343        }
344
345        let end_pos = if let Some(last_token) = self.context.tokens.back() {
346            last_token.range.end
347        } else {
348            start_pos
349        };
350
351        let program = Node::new(
352            self.context.id_generator.next_id(),
353            NodeKind::Program { statements: statement_nodes },
354            Range::new(start_pos, end_pos),
355        );
356
357        NodeWithTrivia { node: program, leading_trivia, trailing_trivia: Vec::new() }
358    }
359
360    /// Parse a statement with trivia
361    fn parse_statement(&mut self) -> Option<NodeWithTrivia> {
362        let (token, leading_trivia, _token_range) = {
363            let token_with_trivia = self.context.current_token()?;
364            (
365                token_with_trivia.token.clone(),
366                token_with_trivia.leading_trivia.clone(),
367                token_with_trivia.range,
368            )
369        };
370
371        // Simple demonstration: parse variable declarations
372        match &token.token_type {
373            TokenType::Keyword(kw)
374                if matches!(kw.as_ref(), "my" | "our" | "local" | "state" | "field") =>
375            {
376                let start_pos = self.context.position_tracker.offset_to_position(token.start);
377
378                let declarator = kw.to_string();
379                self.context.advance();
380
381                // For demonstration, create a simple node
382                let end_pos = self.context.position_tracker.offset_to_position(token.end);
383
384                let node = Node::new(
385                    self.context.id_generator.next_id(),
386                    NodeKind::Identifier { name: declarator },
387                    Range::new(start_pos, end_pos),
388                );
389
390                // Skip to next statement for demo
391                while !self.context.is_eof() {
392                    if let Some(t) = self.context.current_token() {
393                        if matches!(t.token.token_type, TokenType::Semicolon) {
394                            self.context.advance();
395                            break;
396                        }
397                    }
398                    self.context.advance();
399                }
400
401                Some(NodeWithTrivia { node, leading_trivia, trailing_trivia: Vec::new() })
402            }
403            _ => {
404                // Skip unknown tokens for now
405                self.context.advance();
406                None
407            }
408        }
409    }
410}
411
412/// Format an AST with trivia back to source code
413pub fn format_with_trivia(node: &NodeWithTrivia) -> String {
414    let mut result = String::new();
415
416    // Add leading trivia
417    for trivia in &node.leading_trivia {
418        result.push_str(trivia.trivia.as_str());
419    }
420
421    // Add node content (simplified)
422    result.push_str(&format!("{:?}", node.node.kind));
423
424    // Add trailing trivia
425    for trivia in &node.trailing_trivia {
426        result.push_str(trivia.trivia.as_str());
427    }
428
429    result
430}
431
#[cfg(test)]
mod tests {
    use super::*;
    #[allow(unused_imports)]
    use perl_tdd_support::must_some;

    /// A shebang line at the top of the file must surface as the first piece
    /// of trivia on the parsed program.
    #[test]
    fn test_trivia_preservation() {
        // Built line by line so the whitespace-only third line stays visible.
        let source = [
            "#!/usr/bin/perl",
            "# This is a comment",
            "  ",
            "my $x = 42;  # end of line comment",
            "",
            "=pod",
            "This is POD documentation",
            "=cut",
            "",
            "our $y;",
        ]
        .join("\n");

        let result = TriviaPreservingParser::new(source).parse();

        // Check that we have leading trivia
        assert!(!result.leading_trivia.is_empty());

        // First trivia should be the shebang comment
        let first = &result.leading_trivia[0].trivia;
        assert!(matches!(first, Trivia::LineComment(s) if s.starts_with("#!/usr/bin/perl")));
    }

    /// Indentation before the first token is kept verbatim as one whitespace run.
    #[test]
    fn test_whitespace_preservation() {
        let ctx = TriviaParserContext::new(String::from("  \t  my $x;"));

        let first_token = must_some(ctx.current_token());
        let leading = &first_token.leading_trivia;
        assert!(!leading.is_empty());
        assert!(matches!(&leading[0].trivia, Trivia::Whitespace(ws) if ws == "  \t  "));
    }
}