perl-tokenizer 0.12.2

//! Trivia-preserving parser implementation
//!
//! This module provides a parser that preserves comments and whitespace
//! by attaching them to AST nodes as leading/trailing trivia.

use crate::trivia::{NodeWithTrivia, Trivia, TriviaToken};
use perl_ast::v2::{Node, NodeIdGenerator, NodeKind};
use perl_lexer::{PerlLexer, Token, TokenType};
use perl_position_tracking::{Position, Range};
use std::collections::VecDeque;

/// Token with trivia information
///
/// Pairs one lexer token with the trivia that was collected immediately in
/// front of it, plus the token's source range.
#[derive(Debug, Clone)]
pub(crate) struct TokenWithTrivia {
    /// The actual token; `TriviaParserContext::new` stores it with offsets
    /// that are relative to the full source text.
    token: Token,
    /// Leading trivia (comments/whitespace before this token)
    leading_trivia: Vec<TriviaToken>,
    /// Token range, built in `TriviaParserContext::new` from the token's
    /// start and end offsets via the position tracker.
    range: Range,
}

/// Parser context that preserves trivia
///
/// Holds the fully tokenized input (each token with its leading trivia) and a
/// cursor into that token sequence.
pub struct TriviaParserContext {
    /// Source text; kept by the context but not read by the code in this
    /// module after construction (hence the leading underscore).
    _source: String,
    /// Tokens with trivia, in source order
    tokens: VecDeque<TokenWithTrivia>,
    /// Current token index into `tokens`; equal to `tokens.len()` at end of input
    current: usize,
    /// Node ID generator
    id_generator: NodeIdGenerator,
    /// Position tracker for accurate line/column info
    position_tracker: PositionTracker,
}

/// Tracks position in source for accurate line/column information
///
/// Stores only a table of line-start byte offsets; offsets are resolved to a
/// line and column by searching that table.
struct PositionTracker {
    /// Line start offsets (byte offsets, ascending). The first entry is always
    /// 0, followed by the offset just past each `\n` in the source.
    line_starts: Vec<usize>,
}

impl PositionTracker {
    /// Build the line-start table for `source`.
    ///
    /// The table always begins with offset 0; every `\n` byte contributes the
    /// offset of the byte that follows it.
    fn new(source: &str) -> Self {
        // `\n` is a single-byte character in UTF-8 and never occurs inside a
        // multi-byte sequence, so scanning bytes yields the same offsets as
        // scanning characters.
        let after_each_newline = source
            .bytes()
            .enumerate()
            .filter(|&(_, byte)| byte == b'\n')
            .map(|(index, _)| index + 1);

        let line_starts = std::iter::once(0).chain(after_each_newline).collect();
        PositionTracker { line_starts }
    }

    /// Convert a byte `offset` into a `Position` with a 1-based line and a
    /// 1-based, byte-counted column.
    fn offset_to_position(&self, offset: usize) -> Position {
        // An exact hit means `offset` is itself a line start; otherwise the
        // insertion point is one past the line that contains `offset`.
        let line = match self.line_starts.binary_search(&offset) {
            Ok(exact) => exact,
            Err(insertion) => insertion.saturating_sub(1),
        };
        let column = offset - self.line_starts[line] + 1;
        Position::new(offset, (line + 1) as u32, column as u32)
    }
}

impl TriviaParserContext {
    /// Create a new trivia-preserving parser context
    ///
    /// Tokenizes `source` eagerly. For every token, the trivia that precedes
    /// it is collected first and stored as that token's leading trivia; the
    /// token itself is produced by running a fresh `PerlLexer` over the rest
    /// of the source and shifting its offsets back into full-source
    /// coordinates.
    ///
    /// Tokenization stops at the first of: end of input, the lexer returning
    /// `None`, an `EOF` token, or a token that does not advance past the
    /// current offset. Trivia found after the last token is collected but not
    /// stored, because there is no following token to attach it to.
    pub fn new(source: String) -> Self {
        let position_tracker = PositionTracker::new(&source);
        let mut tokens = VecDeque::new();
        let mut position = 0;

        while position < source.len() {
            let leading_trivia = Self::collect_trivia_at(&source, &mut position);
            if position >= source.len() {
                break;
            }

            // NOTE(review): a new lexer is created for each token, so any
            // state the lexer keeps between tokens is not carried over —
            // confirm this is acceptable for context-sensitive constructs.
            let mut lexer = PerlLexer::new(&source[position..]);
            let token = match lexer.next_token() {
                Some(token) if !matches!(token.token_type, TokenType::EOF) => token,
                _ => break,
            };

            // Lexer offsets are relative to the slice it was given.
            let start = position + token.start;
            let end = position + token.end;

            // A token that ends at or before the current offset would be
            // lexed again on the next iteration, forever; stop instead.
            if end <= position {
                break;
            }

            let range = Range::new(
                position_tracker.offset_to_position(start),
                position_tracker.offset_to_position(end),
            );
            let adjusted_token = Token::new(token.token_type, token.text, start, end);

            tokens.push_back(TokenWithTrivia { token: adjusted_token, leading_trivia, range });
            position = end;
        }

        TriviaParserContext {
            _source: source,
            tokens,
            current: 0,
            id_generator: NodeIdGenerator::new(),
            position_tracker,
        }
    }

    /// Collect trivia at the given position
    fn collect_trivia_at(source: &str, position: &mut usize) -> Vec<TriviaToken> {
        let mut trivia = Vec::new();
        let bytes = source.as_bytes();

        while *position < source.len() {
            let _start = *position;
            let ch = bytes[*position];

            match ch {
                // Whitespace
                b' ' | b'\t' | b'\r' => {
                    let ws_start = *position;
                    while *position < source.len()
                        && matches!(bytes[*position], b' ' | b'\t' | b'\r')
                    {
                        *position += 1;
                    }

                    let ws = &source[ws_start..*position];
                    trivia.push(TriviaToken::new(
                        Trivia::Whitespace(ws.to_string()),
                        Range::new(Position::new(ws_start, 0, 0), Position::new(*position, 0, 0)),
                    ));
                }

                // Newline
                b'\n' => {
                    trivia.push(TriviaToken::new(
                        Trivia::Newline,
                        Range::new(
                            Position::new(*position, 0, 0),
                            Position::new(*position + 1, 0, 0),
                        ),
                    ));
                    *position += 1;
                }

                // Comment
                b'#' => {
                    let comment_start = *position;
                    // Find end of line
                    while *position < source.len() && bytes[*position] != b'\n' {
                        *position += 1;
                    }

                    let comment = &source[comment_start..*position];
                    trivia.push(TriviaToken::new(
                        Trivia::LineComment(comment.to_string()),
                        Range::new(
                            Position::new(comment_start, 0, 0),
                            Position::new(*position, 0, 0),
                        ),
                    ));
                }

                // POD documentation
                b'=' if *position == 0 || (*position > 0 && bytes[*position - 1] == b'\n') => {
                    // Check if this starts a POD section
                    let remaining = &source[*position..];
                    if remaining.starts_with("=pod")
                        || remaining.starts_with("=head")
                        || remaining.starts_with("=over")
                        || remaining.starts_with("=item")
                        || remaining.starts_with("=back")
                        || remaining.starts_with("=begin")
                        || remaining.starts_with("=end")
                        || remaining.starts_with("=for")
                        || remaining.starts_with("=encoding")
                    {
                        let pod_start = *position;

                        // Find =cut
                        let mut found_cut = false;
                        while *position < source.len() {
                            if *position > 0
                                && bytes[*position - 1] == b'\n'
                                && source[*position..].starts_with("=cut")
                            {
                                *position += 4; // Skip "=cut"
                                // Skip to end of line
                                while *position < source.len() && bytes[*position] != b'\n' {
                                    *position += 1;
                                }
                                if *position < source.len() {
                                    *position += 1; // Skip newline
                                }
                                found_cut = true;
                                break;
                            }
                            *position += 1;
                        }

                        if !found_cut {
                            *position = source.len();
                        }

                        let pod = &source[pod_start..*position];
                        trivia.push(TriviaToken::new(
                            Trivia::PodComment(pod.to_string()),
                            Range::new(
                                Position::new(pod_start, 0, 0),
                                Position::new(*position, 0, 0),
                            ),
                        ));
                    } else {
                        // Not POD, this is a regular token
                        break;
                    }
                }

                // Non-trivia character
                _ => {
                    // Check for Unicode whitespace
                    if ch >= 128 {
                        let ch_str = &source[*position..];
                        if let Some(unicode_ch) = ch_str.chars().next() {
                            if unicode_ch.is_whitespace() {
                                let ch_len = unicode_ch.len_utf8();
                                trivia.push(TriviaToken::new(
                                    Trivia::Whitespace(unicode_ch.to_string()),
                                    Range::new(
                                        Position::new(*position, 0, 0),
                                        Position::new(*position + ch_len, 0, 0),
                                    ),
                                ));
                                *position += ch_len;
                                continue;
                            }
                        }
                    }

                    // Not trivia, stop collecting
                    break;
                }
            }
        }

        trivia
    }

    /// Borrow the token under the cursor together with its trivia, or `None`
    /// once the cursor has moved past the last token.
    pub(crate) fn current_token(&self) -> Option<&TokenWithTrivia> {
        let cursor = self.current;
        self.tokens.get(cursor)
    }

    /// Move the cursor forward by one token and return the token it now
    /// points at. The cursor never moves beyond one past the last token, so
    /// calling this at end of input is a no-op that returns `None`.
    pub(crate) fn advance(&mut self) -> Option<&TokenWithTrivia> {
        let total = self.tokens.len();
        if self.current < total {
            self.current += 1;
        }
        self.tokens.get(self.current)
    }

    /// Report whether the cursor has run out of tokens, i.e. there is no
    /// token at the current index.
    pub fn is_eof(&self) -> bool {
        self.tokens.get(self.current).is_none()
    }
}

/// Parser that preserves trivia
///
/// Thin wrapper around a `TriviaParserContext`, which holds the tokenized
/// input and the parser's cursor.
pub struct TriviaPreservingParser {
    /// Tokenized source, cursor, ID generator and position tracker.
    context: TriviaParserContext,
}

impl TriviaPreservingParser {
    /// Create a new trivia-preserving parser
    pub fn new(source: String) -> Self {
        TriviaPreservingParser { context: TriviaParserContext::new(source) }
    }

    /// Consume the parser and produce the `Program` node for the source.
    ///
    /// The returned wrapper carries the leading trivia of the first token
    /// and no trailing trivia. The program's range runs from offset 0
    /// (line 1, column 1) to the end of the last token, or is empty at the
    /// origin when there are no tokens. Only the nodes of parsed statements
    /// are kept; the trivia attached to individual statements is not copied
    /// into the program.
    pub fn parse(mut self) -> NodeWithTrivia {
        let origin = Position::new(0, 1, 1);

        // Trivia in front of the very first token belongs to the program.
        let leading_trivia = match self.context.current_token() {
            Some(first) => first.leading_trivia.clone(),
            None => Vec::new(),
        };

        // Every call to `parse_statement` consumes at least one token, so
        // this terminates when the token stream is exhausted.
        let mut statements = Vec::new();
        while !self.context.is_eof() {
            statements.extend(self.parse_statement().map(|stmt| stmt.node));
        }

        let end_pos = self.context.tokens.back().map_or(origin, |last| last.range.end);

        let program = Node::new(
            self.context.id_generator.next_id(),
            NodeKind::Program { statements },
            Range::new(origin, end_pos),
        );

        NodeWithTrivia { node: program, leading_trivia, trailing_trivia: Vec::new() }
    }

    /// Parse one statement starting at the current token.
    ///
    /// Returns `None` without consuming anything when there is no current
    /// token. Otherwise at least one token is consumed:
    ///
    /// * a `my` / `our` / `local` / `state` keyword yields an `Identifier`
    ///   node named after the keyword and spanning just that keyword, with
    ///   the keyword's leading trivia attached; the remaining tokens up to
    ///   and including the next semicolon are skipped;
    /// * any other token is skipped and `None` is returned.
    fn parse_statement(&mut self) -> Option<NodeWithTrivia> {
        // Copy what is needed out of the context so that it can be mutated below.
        let current = self.context.current_token()?;
        let token = current.token.clone();
        let leading_trivia = current.leading_trivia.clone();

        // Simple demonstration: only variable declarators produce a node.
        let declarator = match &token.token_type {
            TokenType::Keyword(kw) if matches!(kw.as_ref(), "my" | "our" | "local" | "state") => {
                kw.to_string()
            }
            _ => {
                // Skip unknown tokens for now
                self.context.advance();
                return None;
            }
        };

        self.context.advance();

        // The demo node covers the declarator keyword only.
        let tracker = &self.context.position_tracker;
        let keyword_range = Range::new(
            tracker.offset_to_position(token.start),
            tracker.offset_to_position(token.end),
        );

        let node = Node::new(
            self.context.id_generator.next_id(),
            NodeKind::Identifier { name: declarator },
            keyword_range,
        );

        // Discard the rest of the statement, consuming the terminating
        // semicolon when there is one.
        while let Some(next) = self.context.current_token() {
            let ends_statement = matches!(next.token.token_type, TokenType::Semicolon);
            self.context.advance();
            if ends_statement {
                break;
            }
        }

        Some(NodeWithTrivia { node, leading_trivia, trailing_trivia: Vec::new() })
    }
}

/// Render a node together with its trivia as text.
///
/// The output is the node's leading trivia, then the `Debug` rendering of
/// the node's kind (a simplified stand-in for real source reconstruction),
/// then the node's trailing trivia, concatenated in that order.
pub fn format_with_trivia(node: &NodeWithTrivia) -> String {
    let mut rendered = String::new();

    // Leading trivia comes first, in source order.
    node.leading_trivia.iter().for_each(|piece| rendered.push_str(piece.trivia.as_str()));

    // Node content (simplified)
    let kind_text = format!("{:?}", node.node.kind);
    rendered.push_str(&kind_text);

    // Trailing trivia closes the output.
    node.trailing_trivia.iter().for_each(|piece| rendered.push_str(piece.trivia.as_str()));

    rendered
}

#[cfg(test)]
mod tests {
    use super::*;
    #[allow(unused_imports)]
    use perl_tdd_support::must_some;

    /// The program node must expose the trivia in front of the first token,
    /// starting with the shebang line as a line comment.
    #[test]
    fn test_trivia_preservation() {
        let script = r#"#!/usr/bin/perl
# This is a comment
  
my $x = 42;  # end of line comment

=pod
This is POD documentation
=cut

our $y;"#;

        let parsed = TriviaPreservingParser::new(script.to_string()).parse();

        // Check that we have leading trivia
        assert!(!parsed.leading_trivia.is_empty());

        // First trivia should be the shebang comment
        let first_piece = parsed.leading_trivia.first().map(|piece| &piece.trivia);
        assert!(matches!(
            first_piece,
            Some(Trivia::LineComment(text)) if text.starts_with("#!/usr/bin/perl")
        ));
    }

    /// A run of spaces and tabs before the first token is kept verbatim as a
    /// single whitespace trivia entry.
    #[test]
    fn test_whitespace_preservation() {
        let context = TriviaParserContext::new(String::from("  \t  my $x;"));

        let head = must_some(context.current_token());
        assert!(!head.leading_trivia.is_empty());

        let first_piece = head.leading_trivia.first().map(|piece| &piece.trivia);
        assert!(matches!(
            first_piece,
            Some(Trivia::Whitespace(run)) if run == "  \t  "
        ));
    }
}