use std::ops::Range;
use rowan::GreenNodeBuilder;
use crate::cst::lossless_tokens::lossless_kind_tokens;
use crate::cst::syntax_kind::{SyntaxKind, SyntaxNode};
/// Parses `source` into a flat tree.
///
/// The result is a single `SOURCE_FILE` root whose direct children are the
/// tokens yielded by `lossless_kind_tokens`, in the order they are produced.
/// No directive-level nesting is built; see [`parse_structured`] for that.
#[must_use]
pub fn parse_flat(source: &str) -> SyntaxNode {
    let mut tree = GreenNodeBuilder::new();
    tree.start_node(SyntaxKind::SOURCE_FILE.into());
    lossless_kind_tokens(source)
        .into_iter()
        .for_each(|(kind, span)| tree.token(kind.into(), &source[span]));
    tree.finish_node();
    let green = tree.finish();
    SyntaxNode::new_root(green)
}
/// Parses `source` into a tree with one child node per top-level item.
///
/// Tokens come from `lossless_kind_tokens`. Trivia tokens are buffered until
/// the next non-trivia token, which starts a node whose kind is chosen by
/// [`identify_directive`] (falling back to `ERROR_NODE`).
///
/// Placement of buffered trivia:
/// - before the *first* node, it is emitted directly under `SOURCE_FILE`;
/// - before every later node, it is emitted *inside* that node, ahead of the
///   node's own tokens;
/// - whatever is still buffered at end of input goes under `SOURCE_FILE`.
#[must_use]
pub fn parse_structured(source: &str) -> SyntaxNode {
    let tokens: Vec<(SyntaxKind, Range<usize>)> = lossless_kind_tokens(source);
    let mut builder = GreenNodeBuilder::new();
    builder.start_node(SyntaxKind::SOURCE_FILE.into());

    let mut leading_trivia: Vec<(SyntaxKind, Range<usize>)> = Vec::new();
    let mut is_first_node = true;
    let mut cursor = 0;

    while let Some(&(kind, ref span)) = tokens.get(cursor) {
        if kind.is_trivia() {
            leading_trivia.push((kind, span.clone()));
            cursor += 1;
            continue;
        }

        let node_kind = identify_directive(&tokens, cursor).unwrap_or(SyntaxKind::ERROR_NODE);
        let trivia = std::mem::take(&mut leading_trivia);
        if is_first_node {
            // Leading trivia of the very first node stays at the root level.
            emit_tokens(&mut builder, source, trivia);
            builder.start_node(node_kind.into());
            is_first_node = false;
        } else {
            builder.start_node(node_kind.into());
            emit_tokens(&mut builder, source, trivia);
        }

        // Each emitter returns the index of the first token it did not consume.
        cursor = match node_kind {
            SyntaxKind::TRANSACTION => {
                emit_transaction_body(&mut builder, source, &tokens, cursor)
            }
            SyntaxKind::ERROR_NODE => {
                emit_through_terminator(&mut builder, source, &tokens, cursor)
            }
            _ => emit_directive_body(&mut builder, source, &tokens, cursor),
        };
        builder.finish_node();
    }

    // Trailing trivia with no following node belongs to the root.
    emit_tokens(&mut builder, source, leading_trivia);
    builder.finish_node();
    SyntaxNode::new_root(builder.finish())
}
/// Emits every `(kind, range)` pair in `tokens` into `builder`, using the
/// text `source[range]` for each token.
fn emit_tokens(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: impl IntoIterator<Item = (SyntaxKind, Range<usize>)>,
) {
    tokens
        .into_iter()
        .for_each(|(kind, span)| builder.token(kind.into(), &source[span]));
}
fn emit_through_terminator(
builder: &mut GreenNodeBuilder<'_>,
source: &str,
tokens: &[(SyntaxKind, Range<usize>)],
mut i: usize,
) -> usize {
while i < tokens.len() {
let (kind, ref range) = tokens[i];
builder.token(kind.into(), &source[range.clone()]);
i += 1;
if kind == SyntaxKind::NEWLINE {
break;
}
}
i
}
/// Emits one indented body line starting at index `i`.
///
/// When the line is a metadata line (see [`starts_meta_sub_line`]) its tokens
/// are wrapped in a `META_ENTRY` node; otherwise they are emitted bare into
/// the currently open node. Returns the index just past the line.
fn emit_body_sub_line(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    i: usize,
) -> usize {
    let wrap_in_meta_entry = starts_meta_sub_line(tokens, i);
    if wrap_in_meta_entry {
        builder.start_node(SyntaxKind::META_ENTRY.into());
    }
    let after_line = emit_through_terminator(builder, source, tokens, i);
    if wrap_in_meta_entry {
        builder.finish_node();
    }
    after_line
}
/// Returns `true` when the tokens at `i` and `i + 1` are `WHITESPACE`
/// followed by `META_KEY`, i.e. an indented metadata line begins at `i`.
fn starts_meta_sub_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    matches!(
        tokens.get(i..),
        Some([(SyntaxKind::WHITESPACE, _), (SyntaxKind::META_KEY, _), ..]),
    )
}
/// Emits a non-transaction directive: its header line plus any indented
/// continuation lines accepted by [`is_indented_directive_continuation`].
///
/// Whether indented comment lines count as continuations is decided once,
/// right after the header, by [`upcoming_indented_block_has_meta`].
/// Returns the index of the first token that was not consumed.
fn emit_directive_body(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    i: usize,
) -> usize {
    let mut cursor = emit_through_terminator(builder, source, tokens, i);
    let block_has_meta = upcoming_indented_block_has_meta(tokens, cursor);
    while is_indented_directive_continuation(tokens, cursor, block_has_meta) {
        cursor = emit_body_sub_line(builder, source, tokens, cursor);
    }
    cursor
}
/// Emits a transaction: its header line followed by every indented body line.
///
/// Body lines are classified in this order:
/// 1. posting line: closes any open `POSTING` node and opens a new one;
/// 2. metadata line: stays inside the open posting when its indent is at
///    least the posting's indent, otherwise the posting is closed first;
/// 3. indented comment: stays inside the open posting only when indented
///    strictly deeper than the posting, otherwise the posting is closed first;
/// 4. anything else: closes the open posting and is emitted bare.
///
/// Any posting still open at the end of the body is closed. Returns the index
/// of the first token that was not consumed.
fn emit_transaction_body(
    builder: &mut GreenNodeBuilder<'_>,
    source: &str,
    tokens: &[(SyntaxKind, Range<usize>)],
    mut i: usize,
) -> usize {
    i = emit_through_terminator(builder, source, tokens, i);

    // `Some(indent)` while a POSTING node is open on the builder.
    let mut posting_indent: Option<usize> = None;

    while is_indented_transaction_body_line(tokens, i) {
        let line_indent = indent_width(tokens, i);

        if starts_posting_sub_line(tokens, i) {
            if posting_indent.take().is_some() {
                builder.finish_node();
            }
            builder.start_node(SyntaxKind::POSTING.into());
            posting_indent = Some(line_indent);
            i = emit_posting_line(builder, source, tokens, i);
        } else if starts_meta_sub_line(tokens, i) {
            close_open_posting_unless_attached(builder, &mut posting_indent, line_indent, true);
            i = emit_body_sub_line(builder, source, tokens, i);
        } else if starts_indented_comment(tokens, i) {
            close_open_posting_unless_attached(builder, &mut posting_indent, line_indent, false);
            i = emit_through_terminator(builder, source, tokens, i);
        } else {
            if posting_indent.take().is_some() {
                builder.finish_node();
            }
            i = emit_through_terminator(builder, source, tokens, i);
        }
    }

    if posting_indent.is_some() {
        builder.finish_node();
    }
    i
}
fn emit_posting_line(
builder: &mut GreenNodeBuilder<'_>,
source: &str,
tokens: &[(SyntaxKind, Range<usize>)],
mut i: usize,
) -> usize {
if let Some((SyntaxKind::WHITESPACE, range)) = tokens.get(i) {
builder.token(SyntaxKind::WHITESPACE.into(), &source[range.clone()]);
i += 1;
}
let next = tokens.get(i).map(|(k, _)| *k);
let is_flag = match next {
Some(SyntaxKind::FLAG | SyntaxKind::STAR | SyntaxKind::PENDING_KW | SyntaxKind::HASH) => {
true
}
Some(SyntaxKind::CURRENCY) => tokens[i].1.len() == 1,
_ => false,
};
if is_flag {
if let Some((kind, range)) = tokens.get(i) {
builder.token((*kind).into(), &source[range.clone()]);
i += 1;
}
if let Some((SyntaxKind::WHITESPACE, range)) = tokens.get(i) {
builder.token(SyntaxKind::WHITESPACE.into(), &source[range.clone()]);
i += 1;
}
}
if let Some((SyntaxKind::ACCOUNT, range)) = tokens.get(i) {
builder.token(SyntaxKind::ACCOUNT.into(), &source[range.clone()]);
i += 1;
}
while i < tokens.len() {
let (kind, range) = (tokens[i].0, tokens[i].1.clone());
if kind == SyntaxKind::NEWLINE {
builder.token(kind.into(), &source[range]);
i += 1;
break;
}
if starts_amount(tokens, i) {
i = emit_amount(builder, source, tokens, i);
continue;
}
if matches!(
kind,
SyntaxKind::L_BRACE | SyntaxKind::L_BRACE_HASH | SyntaxKind::L_DOUBLE_BRACE,
) {
i = emit_cost_spec(builder, source, tokens, i);
continue;
}
if matches!(kind, SyntaxKind::AT | SyntaxKind::AT_AT) {
i = emit_price_annotation(builder, source, tokens, i);
continue;
}
builder.token(kind.into(), &source[range]);
i += 1;
}
i
}
/// Returns `true` when an amount can begin at index `i`.
///
/// That is the case when the token at `i` is `NUMBER`, `CURRENCY` or
/// `L_PAREN`, or when it is a `MINUS`/`PLUS` sign immediately followed by
/// `NUMBER` or `L_PAREN`.
fn starts_amount(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    matches!(
        tokens.get(i..),
        Some(
            [(SyntaxKind::NUMBER | SyntaxKind::CURRENCY | SyntaxKind::L_PAREN, _), ..]
            | [
                (SyntaxKind::MINUS | SyntaxKind::PLUS, _),
                (SyntaxKind::NUMBER | SyntaxKind::L_PAREN, _),
                ..
            ]
        ),
    )
}
/// Returns `true` for the four binary arithmetic operator kinds:
/// `PLUS`, `MINUS`, `STAR` and `SLASH`.
const fn is_arith_op(kind: SyntaxKind) -> bool {
    matches!(
        kind,
        SyntaxKind::STAR | SyntaxKind::SLASH | SyntaxKind::PLUS | SyntaxKind::MINUS,
    )
}
fn emit_amount(
builder: &mut GreenNodeBuilder<'_>,
source: &str,
tokens: &[(SyntaxKind, Range<usize>)],
mut i: usize,
) -> usize {
builder.start_node(SyntaxKind::AMOUNT.into());
if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::CURRENCY))
&& !starts_amount_operand(tokens, i + 1)
{
let range = tokens[i].1.clone();
builder.token(SyntaxKind::CURRENCY.into(), &source[range]);
i += 1;
builder.finish_node();
return i;
}
if matches!(
tokens.get(i).map(|(k, _)| *k),
Some(SyntaxKind::MINUS | SyntaxKind::PLUS),
) {
let (kind, range) = (tokens[i].0, tokens[i].1.clone());
builder.token(kind.into(), &source[range]);
i += 1;
}
i = emit_amount_operand(builder, source, tokens, i);
loop {
let mut j = i;
if matches!(tokens.get(j).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE)) {
j += 1;
}
let Some((op_kind, _)) = tokens.get(j) else {
break;
};
if !is_arith_op(*op_kind) {
break;
}
let op_kind = *op_kind;
j += 1;
if matches!(tokens.get(j).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE)) {
j += 1;
}
let signed = matches!(
tokens.get(j).map(|(k, _)| *k),
Some(SyntaxKind::MINUS | SyntaxKind::PLUS),
);
let operand_start = if signed { j + 1 } else { j };
if !starts_amount_operand(tokens, operand_start) {
break;
}
while i < j {
let (kind, range) = (tokens[i].0, tokens[i].1.clone());
debug_assert!(
kind == SyntaxKind::WHITESPACE || kind == op_kind || is_arith_op(kind),
"unexpected token kind {kind:?} during op-prefix commit",
);
builder.token(kind.into(), &source[range]);
i += 1;
}
if signed {
let (kind, range) = (tokens[i].0, tokens[i].1.clone());
builder.token(kind.into(), &source[range]);
i += 1;
}
i = emit_amount_operand(builder, source, tokens, i);
}
if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE))
&& matches!(
tokens.get(i + 1).map(|(k, _)| *k),
Some(SyntaxKind::CURRENCY),
)
{
let ws_range = tokens[i].1.clone();
builder.token(SyntaxKind::WHITESPACE.into(), &source[ws_range]);
i += 1;
let cur_range = tokens[i].1.clone();
builder.token(SyntaxKind::CURRENCY.into(), &source[cur_range]);
i += 1;
} else if matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::CURRENCY)) {
let cur_range = tokens[i].1.clone();
builder.token(SyntaxKind::CURRENCY.into(), &source[cur_range]);
i += 1;
}
builder.finish_node();
i
}
/// Returns `true` when the token at `i` is `NUMBER` or `L_PAREN`, the two
/// kinds that [`emit_amount_operand`] consumes.
fn starts_amount_operand(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    matches!(
        tokens.get(i),
        Some((SyntaxKind::NUMBER | SyntaxKind::L_PAREN, _)),
    )
}
fn emit_amount_operand(
builder: &mut GreenNodeBuilder<'_>,
source: &str,
tokens: &[(SyntaxKind, Range<usize>)],
mut i: usize,
) -> usize {
match tokens.get(i).map(|(k, _)| *k) {
Some(SyntaxKind::NUMBER) => {
let range = tokens[i].1.clone();
builder.token(SyntaxKind::NUMBER.into(), &source[range]);
i += 1;
}
Some(SyntaxKind::L_PAREN) => {
let range = tokens[i].1.clone();
builder.token(SyntaxKind::L_PAREN.into(), &source[range]);
i += 1;
let mut depth = 1usize;
while depth > 0 {
let Some((kind, range)) = tokens.get(i) else {
break;
};
let (kind, range) = (*kind, range.clone());
if kind == SyntaxKind::NEWLINE {
break;
}
builder.token(kind.into(), &source[range]);
i += 1;
match kind {
SyntaxKind::L_PAREN => depth += 1,
SyntaxKind::R_PAREN => depth -= 1,
_ => {}
}
}
}
_ => {}
}
i
}
fn emit_cost_spec(
builder: &mut GreenNodeBuilder<'_>,
source: &str,
tokens: &[(SyntaxKind, Range<usize>)],
mut i: usize,
) -> usize {
builder.start_node(SyntaxKind::COST_SPEC.into());
if let Some((kind, range)) = tokens.get(i) {
builder.token((*kind).into(), &source[range.clone()]);
i += 1;
}
while i < tokens.len() {
let (kind, range) = (tokens[i].0, tokens[i].1.clone());
if kind == SyntaxKind::NEWLINE {
break;
}
builder.token(kind.into(), &source[range]);
i += 1;
if matches!(kind, SyntaxKind::R_BRACE | SyntaxKind::R_DOUBLE_BRACE) {
break;
}
}
builder.finish_node();
i
}
fn emit_price_annotation(
builder: &mut GreenNodeBuilder<'_>,
source: &str,
tokens: &[(SyntaxKind, Range<usize>)],
mut i: usize,
) -> usize {
builder.start_node(SyntaxKind::PRICE_ANNOTATION.into());
if let Some((kind, range)) = tokens.get(i) {
builder.token((*kind).into(), &source[range.clone()]);
i += 1;
}
let ws_then_amount = matches!(tokens.get(i).map(|(k, _)| *k), Some(SyntaxKind::WHITESPACE),)
&& starts_amount(tokens, i + 1);
if ws_then_amount {
let ws_range = tokens[i].1.clone();
builder.token(SyntaxKind::WHITESPACE.into(), &source[ws_range]);
i += 1;
}
if starts_amount(tokens, i) {
i = emit_amount(builder, source, tokens, i);
}
builder.finish_node();
i
}
/// Closes the currently open posting node unless the sub-line at
/// `sub_line_indent` should stay attached to it.
///
/// `open_posting_indent` is `Some(indent)` while a posting node is open.
/// The sub-line is attached when its indent is strictly greater than the
/// posting's, or equal to it and `attach_on_equal` is set. When it is not
/// attached, the node is finished and `open_posting_indent` is reset to
/// `None`. Does nothing when no posting is open.
fn close_open_posting_unless_attached(
    builder: &mut GreenNodeBuilder<'_>,
    open_posting_indent: &mut Option<usize>,
    sub_line_indent: usize,
    attach_on_equal: bool,
) {
    let Some(posting_indent) = *open_posting_indent else {
        return;
    };
    let attached = sub_line_indent > posting_indent
        || (attach_on_equal && sub_line_indent == posting_indent);
    if !attached {
        builder.finish_node();
        *open_posting_indent = None;
    }
}
/// Returns `true` when a posting line begins at index `i`.
///
/// Accepted token sequences:
/// - `WHITESPACE ACCOUNT`
/// - `WHITESPACE <flag> WHITESPACE ACCOUNT`, where `<flag>` is `FLAG`,
///   `STAR`, `PENDING_KW`, `HASH`, or a `CURRENCY` token one byte long.
fn starts_posting_sub_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    match tokens.get(i..) {
        Some([(SyntaxKind::WHITESPACE, _), (SyntaxKind::ACCOUNT, _), ..]) => true,
        Some(
            [
                (SyntaxKind::WHITESPACE, _),
                (
                    SyntaxKind::FLAG | SyntaxKind::STAR | SyntaxKind::PENDING_KW | SyntaxKind::HASH,
                    _,
                ),
                (SyntaxKind::WHITESPACE, _),
                (SyntaxKind::ACCOUNT, _),
                ..,
            ],
        ) => true,
        Some(
            [
                (SyntaxKind::WHITESPACE, _),
                (SyntaxKind::CURRENCY, flag_span),
                (SyntaxKind::WHITESPACE, _),
                (SyntaxKind::ACCOUNT, _),
                ..,
            ],
        ) => flag_span.len() == 1,
        _ => false,
    }
}
/// Returns the length of the token range at `i` when that token is
/// `WHITESPACE`, and `0` otherwise (including when `i` is out of bounds).
fn indent_width(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> usize {
    if let Some((SyntaxKind::WHITESPACE, span)) = tokens.get(i) {
        span.len()
    } else {
        0
    }
}
/// Returns `true` for the kinds this module treats as comment-class:
/// `COMMENT`, `PERCENT_COMMENT`, `SHEBANG` and `EMACS_DIRECTIVE`.
///
/// A unit test in this file checks that this set stays consistent with
/// `SyntaxKind::is_trivia`.
const fn is_comment_token(kind: SyntaxKind) -> bool {
    matches!(
        kind,
        SyntaxKind::SHEBANG
            | SyntaxKind::EMACS_DIRECTIVE
            | SyntaxKind::COMMENT
            | SyntaxKind::PERCENT_COMMENT,
    )
}
/// Returns `true` when the token at `i` is `WHITESPACE` and the token at
/// `i + 1` is a comment-class token (see [`is_comment_token`]).
fn starts_indented_comment(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    matches!(
        tokens.get(i..),
        Some([(SyntaxKind::WHITESPACE, _), (next, _), ..]) if is_comment_token(*next),
    )
}
/// Returns `true` when an indented, non-blank line begins at `i`: the token
/// at `i` is `WHITESPACE` and a token other than `NEWLINE` follows it.
///
/// Whitespace followed by `NEWLINE` or by end of input yields `false`.
fn is_indented_transaction_body_line(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> bool {
    matches!(
        tokens.get(i..),
        Some([(SyntaxKind::WHITESPACE, _), (next, _), ..])
            if !matches!(next, SyntaxKind::NEWLINE),
    )
}
/// Scans forward from `i` over consecutive indented comment lines and reports
/// whether an indented metadata line (`WHITESPACE` then `META_KEY`) follows.
///
/// Returns `false` as soon as a line is neither an indented comment nor an
/// indented metadata line, or when the input ends before a `NEWLINE`.
fn upcoming_indented_block_has_meta(tokens: &[(SyntaxKind, Range<usize>)], mut i: usize) -> bool {
    loop {
        match tokens.get(i..) {
            Some([(SyntaxKind::WHITESPACE, _), (SyntaxKind::META_KEY, _), ..]) => return true,
            Some([(SyntaxKind::WHITESPACE, _), (next, _), ..]) if is_comment_token(*next) => {
                // Skip the rest of this comment line, including its NEWLINE.
                let newline_offset = tokens[i..]
                    .iter()
                    .position(|(kind, _)| *kind == SyntaxKind::NEWLINE);
                match newline_offset {
                    Some(offset) => i += offset + 1,
                    None => return false,
                }
            }
            _ => return false,
        }
    }
}
/// Returns `true` when the line starting at `i` continues the current
/// directive.
///
/// An indented metadata line always continues it. An indented comment line
/// continues it only when `block_has_meta` is set. Every other line does not.
fn is_indented_directive_continuation(
    tokens: &[(SyntaxKind, Range<usize>)],
    i: usize,
    block_has_meta: bool,
) -> bool {
    starts_meta_sub_line(tokens, i)
        || (block_has_meta
            && matches!(
                tokens.get(i..),
                Some([(SyntaxKind::WHITESPACE, _), (next, _), ..]) if is_comment_token(*next),
            ))
}
fn identify_directive(tokens: &[(SyntaxKind, Range<usize>)], i: usize) -> Option<SyntaxKind> {
let (head, _) = tokens.get(i)?;
match *head {
SyntaxKind::PUSHTAG_KW => Some(SyntaxKind::PUSHTAG_DIRECTIVE),
SyntaxKind::POPTAG_KW => Some(SyntaxKind::POPTAG_DIRECTIVE),
SyntaxKind::PUSHMETA_KW => Some(SyntaxKind::PUSHMETA_DIRECTIVE),
SyntaxKind::POPMETA_KW => Some(SyntaxKind::POPMETA_DIRECTIVE),
SyntaxKind::OPTION_KW => Some(SyntaxKind::OPTION_DIRECTIVE),
SyntaxKind::INCLUDE_KW => Some(SyntaxKind::INCLUDE_DIRECTIVE),
SyntaxKind::PLUGIN_KW => Some(SyntaxKind::PLUGIN_DIRECTIVE),
SyntaxKind::DATE => {
let mut j = i + 1;
while j < tokens.len() && tokens[j].0 == SyntaxKind::WHITESPACE {
j += 1;
}
let (next, _) = tokens.get(j)?;
match *next {
SyntaxKind::OPEN_KW => Some(SyntaxKind::OPEN_DIRECTIVE),
SyntaxKind::CLOSE_KW => Some(SyntaxKind::CLOSE_DIRECTIVE),
SyntaxKind::BALANCE_KW => Some(SyntaxKind::BALANCE_DIRECTIVE),
SyntaxKind::PAD_KW => Some(SyntaxKind::PAD_DIRECTIVE),
SyntaxKind::EVENT_KW => Some(SyntaxKind::EVENT_DIRECTIVE),
SyntaxKind::QUERY_KW => Some(SyntaxKind::QUERY_DIRECTIVE),
SyntaxKind::NOTE_KW => Some(SyntaxKind::NOTE_DIRECTIVE),
SyntaxKind::DOCUMENT_KW => Some(SyntaxKind::DOCUMENT_DIRECTIVE),
SyntaxKind::PRICE_KW => Some(SyntaxKind::PRICE_DIRECTIVE),
SyntaxKind::COMMODITY_KW => Some(SyntaxKind::COMMODITY_DIRECTIVE),
SyntaxKind::CUSTOM_KW => Some(SyntaxKind::CUSTOM_DIRECTIVE),
SyntaxKind::STAR
| SyntaxKind::PENDING_KW
| SyntaxKind::FLAG
| SyntaxKind::HASH
| SyntaxKind::TXN_KW
| SyntaxKind::STRING => Some(SyntaxKind::TRANSACTION),
SyntaxKind::CURRENCY if tokens[j].1.len() == 1 => Some(SyntaxKind::TRANSACTION),
_ => None,
}
}
_ => None,
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that both `parse_flat` and `parse_structured` produce a tree
    /// whose text equals `source` exactly.
    fn assert_round_trips(source: &str) {
        let parsers: [fn(&str) -> SyntaxNode; 2] = [parse_flat, parse_structured];
        for parse in parsers {
            assert_eq!(parse(source).text().to_string(), source);
        }
    }

    /// Guards the relationship between `is_comment_token` and
    /// `SyntaxKind::is_trivia` by checking every `u16` value that converts
    /// to a `SyntaxKind`.
    #[test]
    fn is_comment_token_covers_all_comment_class_trivia() {
        let non_comment_trivia = [SyntaxKind::BOM, SyntaxKind::WHITESPACE, SyntaxKind::NEWLINE];
        let every_kind = || (0u16..=u16::MAX).filter_map(|raw| SyntaxKind::try_from(raw).ok());

        // The two vector names below are captured by the assertion messages.
        let trivia_missed_from_comment: Vec<SyntaxKind> = every_kind()
            .filter(|kind| {
                kind.is_trivia() && !non_comment_trivia.contains(kind) && !is_comment_token(*kind)
            })
            .collect();
        let comment_not_trivia: Vec<SyntaxKind> = every_kind()
            .filter(|kind| is_comment_token(*kind) && !kind.is_trivia())
            .collect();

        assert!(
            trivia_missed_from_comment.is_empty(),
            "trivia kinds present in is_trivia() but missing from \
             is_comment_token: {trivia_missed_from_comment:?}. Three \
             options: (a) add them to is_comment_token if they are \
             comment-class; (b) extend the non_comment_trivia allow- \
             list in this test if they are whitespace-class; (c) if \
             they are neither, revisit whether the body-continuation \
             predicates need a different abstraction and propagate \
             the decision to the three call sites.",
        );
        assert!(
            comment_not_trivia.is_empty(),
            "is_comment_token claims these kinds are comments but \
             is_trivia() disagrees: {comment_not_trivia:?}. Either \
             add them to is_trivia() (if they really are trivia) or \
             remove them from is_comment_token (if they are content \
             tokens that should not be absorbed as comment \
             continuations).",
        );
    }

    #[test]
    fn empty_source() {
        assert_round_trips("");
    }

    #[test]
    fn whitespace_only() {
        assert_round_trips(" \t ");
    }

    #[test]
    fn bom_round_trips() {
        assert_round_trips("\u{FEFF}2024-01-01 open Assets:Bank\n");
    }

    #[test]
    fn full_directive_round_trips() {
        assert_round_trips(
            "2024-01-01 open Assets:Bank USD\n\
             2024-01-15 * \"Coffee\"\n \
             Assets:Bank -5.00 USD\n \
             Expenses:Food\n",
        );
    }

    #[test]
    fn line_comment_round_trips() {
        assert_round_trips("; preamble\n2024-01-01 open Assets:Bank\n");
    }

    #[test]
    fn no_trailing_newline_round_trips() {
        assert_round_trips("2024-01-01 open Assets:Bank");
    }

    /// Both parsers must produce a `SOURCE_FILE` root, even for empty input.
    #[test]
    fn root_kind_is_source_file() {
        let flat = parse_flat("");
        assert_eq!(flat.kind(), SyntaxKind::SOURCE_FILE);
        let structured = parse_structured("");
        assert_eq!(structured.kind(), SyntaxKind::SOURCE_FILE);
    }
}