use perl_ast_v2::{Node, NodeKind};
use perl_lexer::TokenType;
use perl_position_tracking::Range;
/// A piece of source text that carries no syntactic meaning of its own.
#[derive(Debug, Clone, PartialEq)]
pub enum Trivia {
    /// A run of whitespace, stored verbatim.
    Whitespace(String),
    /// A line comment, stored verbatim.
    LineComment(String),
    /// A POD block, stored verbatim.
    PodComment(String),
    /// A line break. The original terminator text is not stored.
    Newline,
}

impl Trivia {
    /// Returns the text of this trivia.
    ///
    /// The string-carrying variants yield their stored text; `Newline`
    /// always yields `"\n"`.
    pub fn as_str(&self) -> &str {
        match self {
            Trivia::Newline => "\n",
            Trivia::Whitespace(text) | Trivia::LineComment(text) | Trivia::PodComment(text) => {
                text.as_str()
            }
        }
    }

    /// Returns a short, fixed label naming the variant.
    pub fn kind_name(&self) -> &'static str {
        match self {
            Trivia::Newline => "newline",
            Trivia::PodComment(_) => "pod",
            Trivia::LineComment(_) => "comment",
            Trivia::Whitespace(_) => "whitespace",
        }
    }
}
/// An AST node bundled with the trivia tokens recorded before and after it.
#[derive(Debug, Clone)]
pub struct NodeWithTrivia {
/// The wrapped AST node.
pub node: Node,
/// Trivia attached ahead of the node.
pub leading_trivia: Vec<TriviaToken>,
/// Trivia attached after the node.
pub trailing_trivia: Vec<TriviaToken>,
}
/// A single piece of trivia together with the source range it was read from.
#[derive(Debug, Clone)]
pub struct TriviaToken {
    /// What kind of trivia this is, including its text where applicable.
    pub trivia: Trivia,
    /// Where the trivia sits in the source.
    pub range: Range,
}

impl TriviaToken {
    /// Pairs `trivia` with the `range` it occupies.
    pub fn new(trivia: Trivia, range: Range) -> Self {
        Self { trivia, range }
    }
}
/// Interface for sources that can hand out leading and trailing trivia.
///
/// Both methods take `&mut self` and return an owned list of trivia tokens.
/// NOTE(review): no implementor is visible in this file; presumably
/// "leading"/"trailing" are relative to the next/previous token — confirm
/// against the implementations.
pub trait TriviaCollector {
/// Collects the trivia considered leading at the current point.
fn collect_leading_trivia(&mut self) -> Vec<TriviaToken>;
/// Collects the trivia considered trailing at the current point.
fn collect_trailing_trivia(&mut self) -> Vec<TriviaToken>;
}
pub struct TriviaLexer {
lexer: perl_lexer::PerlLexer<'static>,
source: String,
position: usize,
_trivia_buffer: Vec<TriviaToken>,
}
impl TriviaLexer {
pub fn new(source: String) -> Self {
let source_ref: &'static str = Box::leak(source.clone().into_boxed_str());
TriviaLexer {
lexer: perl_lexer::PerlLexer::new(source_ref),
source,
position: 0,
_trivia_buffer: Vec::new(),
}
}
pub fn next_token_with_trivia(&mut self) -> Option<(perl_lexer::Token, Vec<TriviaToken>)> {
let trivia = self.collect_trivia();
let token = self.lexer.next_token()?;
self.position = self.position.max(token.end);
if matches!(token.token_type, TokenType::EOF) {
if !trivia.is_empty() {
return Some((token, trivia));
}
return None;
}
Some((token, trivia))
}
fn collect_trivia(&mut self) -> Vec<TriviaToken> {
let mut trivia = Vec::new();
while self.position < self.source.len() {
let remaining = &self.source[self.position..];
if let Some(ws_len) = self.whitespace_length(remaining) {
let ws = &remaining[..ws_len];
let start = self.position;
let end = start + ws_len;
if ws.chars().all(|c| c == '\n' || c == '\r') {
trivia.push(TriviaToken::new(
Trivia::Newline,
Range::new(
perl_position_tracking::Position::new(start, 0, 0),
perl_position_tracking::Position::new(end, 0, 0),
),
));
} else {
trivia.push(TriviaToken::new(
Trivia::Whitespace(ws.to_string()),
Range::new(
perl_position_tracking::Position::new(start, 0, 0),
perl_position_tracking::Position::new(end, 0, 0),
),
));
}
self.position += ws_len;
continue;
}
if remaining.starts_with('#') {
let comment_end = remaining.find('\n').unwrap_or(remaining.len());
let comment = &remaining[..comment_end];
let start = self.position;
let end = start + comment_end;
trivia.push(TriviaToken::new(
Trivia::LineComment(comment.to_string()),
Range::new(
perl_position_tracking::Position::new(start, 0, 0),
perl_position_tracking::Position::new(end, 0, 0),
),
));
self.position += comment_end;
continue;
}
if remaining.starts_with("=")
&& (self.position == 0 || self.source.as_bytes()[self.position - 1] == b'\n')
{
if let Some(pod_end) = self.find_pod_end(remaining) {
let pod = &remaining[..pod_end];
let start = self.position;
let end = start + pod_end;
trivia.push(TriviaToken::new(
Trivia::PodComment(pod.to_string()),
Range::new(
perl_position_tracking::Position::new(start, 0, 0),
perl_position_tracking::Position::new(end, 0, 0),
),
));
self.position += pod_end;
continue;
}
}
break;
}
if self.position > 0 {
}
trivia
}
fn whitespace_length(&self, s: &str) -> Option<usize> {
let mut len = 0;
for ch in s.chars() {
if ch.is_whitespace() && ch != '\n' && ch != '\r' {
len += ch.len_utf8();
} else if ch == '\n' || ch == '\r' {
len += ch.len_utf8();
if ch == '\r' && s[len..].starts_with('\n') {
len += 1;
}
break;
} else {
break;
}
}
if len > 0 { Some(len) } else { None }
}
fn find_pod_end(&self, s: &str) -> Option<usize> {
let mut pos = 0;
for line in s.lines() {
if line.trim() == "=cut" {
return Some(pos + line.len());
}
pos += line.len() + 1; }
Some(s.len())
}
}
/// Parser front end that keeps the trivia seen in front of the first token.
pub struct TriviaPreservingParser {
    /// Token source that also reports the trivia preceding each token.
    lexer: TriviaLexer,
    /// The most recently read token and its leading trivia, if any.
    current: Option<(perl_lexer::Token, Vec<TriviaToken>)>,
    /// Hands out ids for the nodes this parser creates.
    id_generator: perl_ast_v2::NodeIdGenerator,
}

impl TriviaPreservingParser {
    /// Creates a parser over `source` and reads the first token so that
    /// `current` is primed before `parse` runs.
    pub fn new(source: String) -> Self {
        let mut parser = Self {
            lexer: TriviaLexer::new(source),
            current: None,
            id_generator: perl_ast_v2::NodeIdGenerator::new(),
        };
        parser.advance();
        parser
    }

    /// Replaces `current` with the next token/trivia pair from the lexer.
    fn advance(&mut self) {
        self.current = self.lexer.next_token_with_trivia();
    }

    /// Consumes the parser and produces the program node.
    ///
    /// The result is a `Program` node with no statements and a range whose
    /// start and end are both built from `Position::new(0, 1, 1)`. Its leading
    /// trivia is whatever preceded the first token (empty when there is no
    /// current token); trailing trivia is always empty.
    pub fn parse(mut self) -> NodeWithTrivia {
        // The parser is consumed here, so the trivia can be moved out of
        // `current` rather than copied.
        let leading_trivia = self
            .current
            .take()
            .map(|(_, trivia)| trivia)
            .unwrap_or_default();

        let origin = || perl_position_tracking::Position::new(0, 1, 1);
        let id = self.id_generator.next_id();
        let kind = NodeKind::Program { statements: Vec::new() };
        let node = Node::new(id, kind, Range::new(origin(), origin()));

        NodeWithTrivia {
            node,
            leading_trivia,
            trailing_trivia: Vec::new(),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use perl_tdd_support::must_some;

    /// Whitespace and a `#` comment in front of the first token are both
    /// reported as trivia.
    #[test]
    fn test_trivia_collection() {
        let mut lexer = TriviaLexer::new(" # comment\n my $x = 42;".to_string());
        let (_token, trivia) = must_some(lexer.next_token_with_trivia());

        // Dump what was collected to make a failing run easier to diagnose.
        eprintln!("Trivia count: {}", trivia.len());
        for (i, t) in trivia.iter().enumerate() {
            eprintln!("Trivia[{}]: {:?}", i, t.trivia);
        }

        assert!(trivia.len() >= 2);
        assert!(trivia.iter().any(|t| matches!(&t.trivia, Trivia::Whitespace(_))));
        assert!(trivia.iter().any(|t| matches!(&t.trivia, Trivia::LineComment(_))));
    }

    /// A POD block at the top of the file is reported as trivia.
    #[test]
    fn test_pod_preservation() {
        let mut lexer = TriviaLexer::new("=head1 NAME\n\nTest\n\n=cut\n\nmy $x;".to_string());
        let (_, leading) = must_some(lexer.next_token_with_trivia());
        assert!(leading.iter().any(|t| matches!(&t.trivia, Trivia::PodComment(_))));
    }
}