Skip to main content

perl_tokenizer/
trivia.rs

//! Trivia (comments and whitespace) handling for the Perl parser
//!
//! This module provides support for preserving comments and whitespace
//! in the AST, which is essential for code formatting and refactoring tools.
5
6use perl_ast_v2::{Node, NodeKind};
7use perl_lexer::TokenType;
8use perl_position_tracking::Range;
9
/// Trivia represents non-semantic tokens like comments and whitespace.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` (every payload is a plain
/// `String`), so trivia values can be stored in sets or used as map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Trivia {
    /// Whitespace (spaces, tabs, etc.); carries the matched text.
    Whitespace(String),
    /// Single-line comment starting with `#`; carries the comment text.
    LineComment(String),
    /// POD documentation; carries the POD text.
    PodComment(String),
    /// Newline character(s); carries no text of its own.
    Newline,
}
22
23impl Trivia {
24    /// Convert trivia to a string representation
25    pub fn as_str(&self) -> &str {
26        match self {
27            Trivia::Whitespace(s) => s,
28            Trivia::LineComment(s) => s,
29            Trivia::PodComment(s) => s,
30            Trivia::Newline => "\n",
31        }
32    }
33
34    /// Get the display name for this trivia type
35    pub fn kind_name(&self) -> &'static str {
36        match self {
37            Trivia::Whitespace(_) => "whitespace",
38            Trivia::LineComment(_) => "comment",
39            Trivia::PodComment(_) => "pod",
40            Trivia::Newline => "newline",
41        }
42    }
43}
44
45/// A node with attached trivia
46#[derive(Debug, Clone)]
47pub struct NodeWithTrivia {
48    /// The actual AST node
49    pub node: Node,
50    /// Trivia that appears before this node
51    pub leading_trivia: Vec<TriviaToken>,
52    /// Trivia that appears after this node
53    pub trailing_trivia: Vec<TriviaToken>,
54}
55
56/// A trivia token with position information
57#[derive(Debug, Clone)]
58pub struct TriviaToken {
59    /// The trivia content
60    pub trivia: Trivia,
61    /// The source range of this trivia
62    pub range: Range,
63}
64
65impl TriviaToken {
66    /// Create a new trivia token with the given content and range
67    pub fn new(trivia: Trivia, range: Range) -> Self {
68        TriviaToken { trivia, range }
69    }
70}
71
72/// Extension trait for collecting trivia.
73///
74/// Implement this trait to collect leading and trailing trivia during lexing.
75pub trait TriviaCollector {
76    /// Collect trivia tokens before the next meaningful token
77    fn collect_leading_trivia(&mut self) -> Vec<TriviaToken>;
78
79    /// Collect trivia tokens after a node (typically until newline)
80    fn collect_trailing_trivia(&mut self) -> Vec<TriviaToken>;
81}
82
83/// A lexer wrapper that preserves trivia.
84///
85/// Wraps the Perl lexer to collect comments and whitespace as trivia tokens.
86pub struct TriviaLexer {
87    /// The underlying Perl lexer
88    lexer: perl_lexer::PerlLexer<'static>,
89    /// Source code (owned)
90    source: String,
91    /// Current position for trivia tracking
92    position: usize,
93    /// Buffered trivia tokens
94    _trivia_buffer: Vec<TriviaToken>,
95}
96
97impl TriviaLexer {
98    /// Create a new trivia-preserving lexer
99    pub fn new(source: String) -> Self {
100        // We need to leak the string to get a 'static reference
101        // In a real implementation, we'd use a better lifetime strategy
102        let source_ref: &'static str = Box::leak(source.clone().into_boxed_str());
103
104        TriviaLexer {
105            lexer: perl_lexer::PerlLexer::new(source_ref),
106            source,
107            position: 0,
108            _trivia_buffer: Vec::new(),
109        }
110    }
111
112    /// Get the next token, collecting any preceding trivia.
113    ///
114    /// Returns the token along with any whitespace or comments that precede it.
115    pub fn next_token_with_trivia(&mut self) -> Option<(perl_lexer::Token, Vec<TriviaToken>)> {
116        // First, collect any trivia
117        let trivia = self.collect_trivia();
118
119        // Then get the next meaningful token
120        let token = self.lexer.next_token()?;
121
122        // Sync position past this token so next collect_trivia() starts after it
123        self.position = self.position.max(token.end);
124
125        // Edge case fix: If we hit EOF but have trailing trivia, return it with the EOF token
126        if matches!(token.token_type, TokenType::EOF) {
127            if !trivia.is_empty() {
128                // Return EOF with trailing trivia so it's not lost
129                return Some((token, trivia));
130            }
131            return None;
132        }
133
134        Some((token, trivia))
135    }
136
137    /// Collect trivia tokens at current position
138    fn collect_trivia(&mut self) -> Vec<TriviaToken> {
139        let mut trivia = Vec::new();
140
141        while self.position < self.source.len() {
142            let remaining = &self.source[self.position..];
143
144            // Check for whitespace
145            if let Some(ws_len) = self.whitespace_length(remaining) {
146                let ws = &remaining[..ws_len];
147                let start = self.position;
148                let end = start + ws_len;
149
150                // Check if it's just newlines
151                if ws.chars().all(|c| c == '\n' || c == '\r') {
152                    trivia.push(TriviaToken::new(
153                        Trivia::Newline,
154                        Range::new(
155                            perl_position_tracking::Position::new(start, 0, 0),
156                            perl_position_tracking::Position::new(end, 0, 0),
157                        ),
158                    ));
159                } else {
160                    trivia.push(TriviaToken::new(
161                        Trivia::Whitespace(ws.to_string()),
162                        Range::new(
163                            perl_position_tracking::Position::new(start, 0, 0),
164                            perl_position_tracking::Position::new(end, 0, 0),
165                        ),
166                    ));
167                }
168
169                self.position += ws_len;
170                continue;
171            }
172
173            // Check for comments
174            if remaining.starts_with('#') {
175                let comment_end = remaining.find('\n').unwrap_or(remaining.len());
176                let comment = &remaining[..comment_end];
177                let start = self.position;
178                let end = start + comment_end;
179
180                trivia.push(TriviaToken::new(
181                    Trivia::LineComment(comment.to_string()),
182                    Range::new(
183                        perl_position_tracking::Position::new(start, 0, 0),
184                        perl_position_tracking::Position::new(end, 0, 0),
185                    ),
186                ));
187
188                self.position += comment_end;
189                continue;
190            }
191
192            // Check for POD
193            if remaining.starts_with("=")
194                && (self.position == 0 || self.source.as_bytes()[self.position - 1] == b'\n')
195            {
196                if let Some(pod_end) = self.find_pod_end(remaining) {
197                    let pod = &remaining[..pod_end];
198                    let start = self.position;
199                    let end = start + pod_end;
200
201                    trivia.push(TriviaToken::new(
202                        Trivia::PodComment(pod.to_string()),
203                        Range::new(
204                            perl_position_tracking::Position::new(start, 0, 0),
205                            perl_position_tracking::Position::new(end, 0, 0),
206                        ),
207                    ));
208
209                    self.position += pod_end;
210                    continue;
211                }
212            }
213
214            // No more trivia
215            break;
216        }
217
218        // Sync lexer position
219        if self.position > 0 {
220            // The lexer will skip whitespace internally, so we need to ensure
221            // our position tracking stays in sync
222        }
223
224        trivia
225    }
226
227    /// Calculate the length of whitespace at the start of the string
228    fn whitespace_length(&self, s: &str) -> Option<usize> {
229        let mut len = 0;
230        for ch in s.chars() {
231            if ch.is_whitespace() && ch != '\n' && ch != '\r' {
232                len += ch.len_utf8();
233            } else if ch == '\n' || ch == '\r' {
234                // Handle newlines separately
235                len += ch.len_utf8();
236                // Handle \r\n
237                if ch == '\r' && s[len..].starts_with('\n') {
238                    len += 1;
239                }
240                break;
241            } else {
242                break;
243            }
244        }
245
246        if len > 0 { Some(len) } else { None }
247    }
248
249    /// Find the end of a POD section
250    fn find_pod_end(&self, s: &str) -> Option<usize> {
251        // POD ends with =cut at the beginning of a line
252        let mut pos = 0;
253        for line in s.lines() {
254            if line.trim() == "=cut" {
255                return Some(pos + line.len());
256            }
257            pos += line.len() + 1; // +1 for newline
258        }
259
260        // If no =cut found, POD extends to end of string
261        Some(s.len())
262    }
263}
264
265/// Parser that preserves trivia.
266///
267/// A parser that attaches comments and whitespace to AST nodes for formatting.
268pub struct TriviaPreservingParser {
269    /// Trivia-aware lexer
270    lexer: TriviaLexer,
271    /// Current lookahead token
272    current: Option<(perl_lexer::Token, Vec<TriviaToken>)>,
273    /// Node ID generator
274    id_generator: perl_ast_v2::NodeIdGenerator,
275}
276
277impl TriviaPreservingParser {
278    /// Create a new trivia-preserving parser
279    pub fn new(source: String) -> Self {
280        let mut parser = TriviaPreservingParser {
281            lexer: TriviaLexer::new(source),
282            current: None,
283            id_generator: perl_ast_v2::NodeIdGenerator::new(),
284        };
285        // Prime the lookahead
286        parser.advance();
287        parser
288    }
289
290    /// Advance to the next token
291    fn advance(&mut self) {
292        self.current = self.lexer.next_token_with_trivia();
293    }
294
295    /// Parse and return AST with trivia preserved.
296    ///
297    /// Returns a node with leading and trailing trivia attached.
298    pub fn parse(mut self) -> NodeWithTrivia {
299        let leading_trivia =
300            if let Some((_, trivia)) = &self.current { trivia.clone() } else { Vec::new() };
301
302        // For now, create a simple demonstration node
303        let node = Node::new(
304            self.id_generator.next_id(),
305            NodeKind::Program { statements: Vec::new() },
306            Range::new(
307                perl_position_tracking::Position::new(0, 1, 1),
308                perl_position_tracking::Position::new(0, 1, 1),
309            ),
310        );
311
312        NodeWithTrivia { node, leading_trivia, trailing_trivia: Vec::new() }
313    }
314}
315
#[cfg(test)]
mod tests {
    use super::*;
    use perl_tdd_support::must_some;

    /// Strip the ranges from trivia tokens so whole sequences can be compared.
    fn kinds(tokens: &[TriviaToken]) -> Vec<Trivia> {
        tokens.iter().map(|token| token.trivia.clone()).collect()
    }

    /// Leading indentation, a comment, its line break and the next line's
    /// indentation are all reported, in source order, ahead of the first token.
    #[test]
    fn test_trivia_collection() {
        let source = "  # comment\n  my $x = 42;".to_string();
        let mut lexer = TriviaLexer::new(source);

        let (_token, trivia) = must_some(lexer.next_token_with_trivia());

        assert_eq!(
            kinds(&trivia),
            vec![
                Trivia::Whitespace("  ".to_string()),
                Trivia::LineComment("# comment".to_string()),
                Trivia::Newline,
                Trivia::Whitespace("  ".to_string()),
            ]
        );
    }

    /// A POD block is reported as a single trivia token that ends with `=cut`;
    /// the blank line after it shows up as separate newline trivia.
    #[test]
    fn test_pod_preservation() {
        let source = "=head1 NAME\n\nTest\n\n=cut\n\nmy $x;".to_string();
        let mut lexer = TriviaLexer::new(source);

        let (_, trivia) = must_some(lexer.next_token_with_trivia());

        assert_eq!(
            kinds(&trivia),
            vec![
                Trivia::PodComment("=head1 NAME\n\nTest\n\n=cut".to_string()),
                Trivia::Newline,
                Trivia::Newline,
            ]
        );
    }
}