1use crate::trivia::{NodeWithTrivia, Trivia, TriviaToken};
7use perl_ast_v2::{Node, NodeIdGenerator, NodeKind};
8use perl_lexer::{PerlLexer, Token, TokenType};
9use perl_position_tracking::{Position, Range};
10use std::collections::VecDeque;
11
12#[derive(Debug, Clone)]
14pub(crate) struct TokenWithTrivia {
15 token: Token,
17 leading_trivia: Vec<TriviaToken>,
19 range: Range,
21}
22
23pub struct TriviaParserContext {
25 _source: String,
27 tokens: VecDeque<TokenWithTrivia>,
29 current: usize,
31 id_generator: NodeIdGenerator,
33 position_tracker: PositionTracker,
35}
36
37struct PositionTracker {
39 line_starts: Vec<usize>,
41}
42
43impl PositionTracker {
44 fn new(source: &str) -> Self {
45 let mut line_starts = vec![0];
46 for (i, ch) in source.char_indices() {
47 if ch == '\n' {
48 line_starts.push(i + 1);
49 }
50 }
51 PositionTracker { line_starts }
52 }
53
54 fn offset_to_position(&self, offset: usize) -> Position {
55 let line = self.line_starts.binary_search(&offset).unwrap_or_else(|i| i.saturating_sub(1));
56 let line_start = self.line_starts[line];
57 let column = offset - line_start + 1;
58 Position::new(offset, (line + 1) as u32, column as u32)
59 }
60}
61
62impl TriviaParserContext {
63 pub fn new(source: String) -> Self {
65 let position_tracker = PositionTracker::new(&source);
66 let mut tokens = VecDeque::new();
67
68 let mut position = 0;
70 let _source_bytes = source.as_bytes();
71
72 while position < source.len() {
73 let _trivia_start = position;
75 let leading_trivia = Self::collect_trivia_at(&source, &mut position);
76
77 if position >= source.len() {
78 break;
79 }
80
81 let token_source = &source[position..];
83 let mut lexer = PerlLexer::new(token_source);
84
85 if let Some(token) = lexer.next_token() {
86 if matches!(token.token_type, TokenType::EOF) {
88 break;
89 }
90
91 let adjusted_token = Token::new(
93 token.token_type.clone(),
94 token.text.clone(),
95 position + token.start,
96 position + token.end,
97 );
98
99 let start_pos = position_tracker.offset_to_position(adjusted_token.start);
101 let end_pos = position_tracker.offset_to_position(adjusted_token.end);
102 let range = Range::new(start_pos, end_pos);
103
104 tokens.push_back(TokenWithTrivia {
105 token: adjusted_token.clone(),
106 leading_trivia,
107 range,
108 });
109
110 position = adjusted_token.end;
112 } else {
113 break;
114 }
115 }
116
117 if tokens.is_empty() || position < source.len() {
119 let remaining_trivia = if position < source.len() {
120 Self::collect_trivia_at(&source, &mut position)
121 } else {
122 Vec::new()
123 };
124 if !remaining_trivia.is_empty() || tokens.is_empty() {
125 let trivia = if tokens.is_empty() {
126 let mut pos = 0;
128 Self::collect_trivia_at(&source, &mut pos)
129 } else {
130 remaining_trivia
131 };
132 if !trivia.is_empty() {
133 let eof_pos = position_tracker.offset_to_position(source.len());
134 let eof_token =
135 Token::new(TokenType::EOF, String::new(), source.len(), source.len());
136 tokens.push_back(TokenWithTrivia {
137 token: eof_token,
138 leading_trivia: trivia,
139 range: Range::new(eof_pos, eof_pos),
140 });
141 }
142 }
143 }
144
145 TriviaParserContext {
146 _source: source,
147 tokens,
148 current: 0,
149 id_generator: NodeIdGenerator::new(),
150 position_tracker,
151 }
152 }
153
154 fn collect_trivia_at(source: &str, position: &mut usize) -> Vec<TriviaToken> {
156 let mut trivia = Vec::new();
157 let bytes = source.as_bytes();
158
159 while *position < source.len() {
160 let _start = *position;
161 let ch = bytes[*position];
162
163 match ch {
164 b' ' | b'\t' | b'\r' => {
166 let ws_start = *position;
167 while *position < source.len()
168 && matches!(bytes[*position], b' ' | b'\t' | b'\r')
169 {
170 *position += 1;
171 }
172
173 let ws = &source[ws_start..*position];
174 trivia.push(TriviaToken::new(
175 Trivia::Whitespace(ws.to_string()),
176 Range::new(Position::new(ws_start, 0, 0), Position::new(*position, 0, 0)),
177 ));
178 }
179
180 b'\n' => {
182 trivia.push(TriviaToken::new(
183 Trivia::Newline,
184 Range::new(
185 Position::new(*position, 0, 0),
186 Position::new(*position + 1, 0, 0),
187 ),
188 ));
189 *position += 1;
190 }
191
192 b'#' => {
194 let comment_start = *position;
195 while *position < source.len() && bytes[*position] != b'\n' {
197 *position += 1;
198 }
199
200 let comment = &source[comment_start..*position];
201 trivia.push(TriviaToken::new(
202 Trivia::LineComment(comment.to_string()),
203 Range::new(
204 Position::new(comment_start, 0, 0),
205 Position::new(*position, 0, 0),
206 ),
207 ));
208 }
209
210 b'=' if *position == 0 || (*position > 0 && bytes[*position - 1] == b'\n') => {
212 let remaining = &source[*position..];
214 if remaining.starts_with("=pod")
215 || remaining.starts_with("=head")
216 || remaining.starts_with("=over")
217 || remaining.starts_with("=item")
218 || remaining.starts_with("=back")
219 || remaining.starts_with("=begin")
220 || remaining.starts_with("=end")
221 || remaining.starts_with("=for")
222 || remaining.starts_with("=encoding")
223 {
224 let pod_start = *position;
225
226 let mut found_cut = false;
228 while *position < source.len() {
229 if (*position == 0 || (*position > 0 && bytes[*position - 1] == b'\n'))
231 && source[*position..].starts_with("=cut")
232 {
233 *position += 4; while *position < source.len() && bytes[*position] != b'\n' {
236 *position += 1;
237 }
238 if *position < source.len() {
239 *position += 1; }
241 found_cut = true;
242 break;
243 }
244 *position += 1;
245 }
246
247 if !found_cut {
249 *position = source.len();
250 }
251
252 let pod = &source[pod_start..*position];
253 trivia.push(TriviaToken::new(
254 Trivia::PodComment(pod.to_string()),
255 Range::new(
256 Position::new(pod_start, 0, 0),
257 Position::new(*position, 0, 0),
258 ),
259 ));
260 } else {
261 break;
263 }
264 }
265
266 _ => {
268 if ch >= 128 {
270 let ch_str = &source[*position..];
271 if let Some(unicode_ch) = ch_str.chars().next() {
272 if unicode_ch.is_whitespace() {
273 let ch_len = unicode_ch.len_utf8();
274 trivia.push(TriviaToken::new(
275 Trivia::Whitespace(unicode_ch.to_string()),
276 Range::new(
277 Position::new(*position, 0, 0),
278 Position::new(*position + ch_len, 0, 0),
279 ),
280 ));
281 *position += ch_len;
282 continue;
283 }
284 }
285 }
286
287 break;
289 }
290 }
291 }
292
293 trivia
294 }
295
296 pub(crate) fn current_token(&self) -> Option<&TokenWithTrivia> {
298 self.tokens.get(self.current)
299 }
300
301 pub(crate) fn advance(&mut self) -> Option<&TokenWithTrivia> {
303 if self.current < self.tokens.len() {
304 self.current += 1;
305 }
306 self.current_token()
307 }
308
309 pub fn is_eof(&self) -> bool {
311 self.current >= self.tokens.len()
312 }
313}
314
315pub struct TriviaPreservingParser {
317 context: TriviaParserContext,
318}
319
320impl TriviaPreservingParser {
321 pub fn new(source: String) -> Self {
323 TriviaPreservingParser { context: TriviaParserContext::new(source) }
324 }
325
326 pub fn parse(mut self) -> NodeWithTrivia {
328 let start_pos = Position::new(0, 1, 1);
329 let mut statement_nodes = Vec::new();
330
331 let mut leading_trivia = Vec::new();
334 for token in &self.context.tokens {
335 leading_trivia.extend(token.leading_trivia.iter().cloned());
336 }
337
338 while !self.context.is_eof() {
340 if let Some(stmt) = self.parse_statement() {
341 statement_nodes.push(stmt.node);
342 }
343 }
344
345 let end_pos = if let Some(last_token) = self.context.tokens.back() {
346 last_token.range.end
347 } else {
348 start_pos
349 };
350
351 let program = Node::new(
352 self.context.id_generator.next_id(),
353 NodeKind::Program { statements: statement_nodes },
354 Range::new(start_pos, end_pos),
355 );
356
357 NodeWithTrivia { node: program, leading_trivia, trailing_trivia: Vec::new() }
358 }
359
360 fn parse_statement(&mut self) -> Option<NodeWithTrivia> {
362 let (token, leading_trivia, _token_range) = {
363 let token_with_trivia = self.context.current_token()?;
364 (
365 token_with_trivia.token.clone(),
366 token_with_trivia.leading_trivia.clone(),
367 token_with_trivia.range,
368 )
369 };
370
371 match &token.token_type {
373 TokenType::Keyword(kw)
374 if matches!(kw.as_ref(), "my" | "our" | "local" | "state" | "field") =>
375 {
376 let start_pos = self.context.position_tracker.offset_to_position(token.start);
377
378 let declarator = kw.to_string();
379 self.context.advance();
380
381 let end_pos = self.context.position_tracker.offset_to_position(token.end);
383
384 let node = Node::new(
385 self.context.id_generator.next_id(),
386 NodeKind::Identifier { name: declarator },
387 Range::new(start_pos, end_pos),
388 );
389
390 while !self.context.is_eof() {
392 if let Some(t) = self.context.current_token() {
393 if matches!(t.token.token_type, TokenType::Semicolon) {
394 self.context.advance();
395 break;
396 }
397 }
398 self.context.advance();
399 }
400
401 Some(NodeWithTrivia { node, leading_trivia, trailing_trivia: Vec::new() })
402 }
403 _ => {
404 self.context.advance();
406 None
407 }
408 }
409 }
410}
411
412pub fn format_with_trivia(node: &NodeWithTrivia) -> String {
414 let mut result = String::new();
415
416 for trivia in &node.leading_trivia {
418 result.push_str(trivia.trivia.as_str());
419 }
420
421 result.push_str(&format!("{:?}", node.node.kind));
423
424 for trivia in &node.trailing_trivia {
426 result.push_str(trivia.trivia.as_str());
427 }
428
429 result
430}
431
#[cfg(test)]
mod tests {
    use super::*;
    #[allow(unused_imports)]
    use perl_tdd_support::must_some;

    /// The shebang line must surface as the first piece of program trivia.
    #[test]
    fn test_trivia_preservation() {
        let script = String::from(
            r#"#!/usr/bin/perl
# This is a comment

my $x = 42; # end of line comment

=pod
This is POD documentation
=cut

our $y;"#,
        );

        let program = TriviaPreservingParser::new(script).parse();

        let first = program.leading_trivia.first();
        assert!(first.is_some());
        assert!(matches!(
            first.map(|piece| &piece.trivia),
            Some(Trivia::LineComment(s)) if s.starts_with("#!/usr/bin/perl")
        ));
    }

    /// Leading blanks before the first token are kept as one whitespace run.
    #[test]
    fn test_whitespace_preservation() {
        let context = TriviaParserContext::new(" \t my $x;".to_string());

        let head = must_some(context.current_token());
        let first = head.leading_trivia.first();
        assert!(first.is_some());
        assert!(matches!(
            first.map(|piece| &piece.trivia),
            Some(Trivia::Whitespace(ws)) if ws == " \t "
        ));
    }
}