use std::ops::Range;
use crate::bom::strip_leading;
use crate::cst::syntax_kind::SyntaxKind;
use crate::logos_lexer::{Token, tokenize_lossless};
/// Number of bytes U+FEFF (the byte-order mark) occupies when encoded as UTF-8.
///
/// Computed from the character itself rather than spelled as a bare literal so
/// the origin of the value is visible at the definition site.
const UTF8_BOM_LEN: usize = '\u{FEFF}'.len_utf8();
/// Tokenizes `source` and returns one `(SyntaxKind, byte range)` pair per token.
///
/// The input is first passed through [`strip_leading`]; when that reports a
/// leading BOM, a [`SyntaxKind::BOM`] entry spanning `0..UTF8_BOM_LEN` is
/// emitted first and every span produced by the lexer is shifted by
/// `UTF8_BOM_LEN` so that all ranges index into the original `source` rather
/// than into the BOM-stripped text handed to the lexer.
///
/// Lexer tokens are translated to syntax kinds with `map_kind`.
#[must_use]
pub fn lossless_kind_tokens(source: &str) -> Vec<(SyntaxKind, Range<usize>)> {
    let (body, bom_present) = strip_leading(source);

    // Spans from the lexer are relative to `body`; this re-bases them onto `source`.
    let shift = if bom_present { UTF8_BOM_LEN } else { 0 };

    let mut entries: Vec<(SyntaxKind, Range<usize>)> = Vec::new();
    if bom_present {
        entries.push((SyntaxKind::BOM, 0..UTF8_BOM_LEN));
    }
    entries.extend(
        tokenize_lossless(body)
            .into_iter()
            .map(|(tok, span)| (map_kind(&tok), (span.start + shift)..(span.end + shift))),
    );
    entries
}
const fn map_kind(token: &Token<'_>) -> SyntaxKind {
match token {
Token::Date(_) => SyntaxKind::DATE,
Token::Number(_) => SyntaxKind::NUMBER,
Token::String(_) => SyntaxKind::STRING,
Token::Account(_) => SyntaxKind::ACCOUNT,
Token::Currency(_) => SyntaxKind::CURRENCY,
Token::Tag(_) => SyntaxKind::TAG,
Token::Link(_) => SyntaxKind::LINK,
Token::MetaKey(_) => SyntaxKind::META_KEY,
Token::Flag(_) => SyntaxKind::FLAG,
Token::True => SyntaxKind::BOOL_TRUE,
Token::False => SyntaxKind::BOOL_FALSE,
Token::Null => SyntaxKind::NULL_KW,
Token::Txn => SyntaxKind::TXN_KW,
Token::Balance => SyntaxKind::BALANCE_KW,
Token::Open => SyntaxKind::OPEN_KW,
Token::Close => SyntaxKind::CLOSE_KW,
Token::Commodity => SyntaxKind::COMMODITY_KW,
Token::Pad => SyntaxKind::PAD_KW,
Token::Event => SyntaxKind::EVENT_KW,
Token::Query => SyntaxKind::QUERY_KW,
Token::Note => SyntaxKind::NOTE_KW,
Token::Document => SyntaxKind::DOCUMENT_KW,
Token::Price => SyntaxKind::PRICE_KW,
Token::Custom => SyntaxKind::CUSTOM_KW,
Token::Option_ => SyntaxKind::OPTION_KW,
Token::Include => SyntaxKind::INCLUDE_KW,
Token::Plugin => SyntaxKind::PLUGIN_KW,
Token::Pushtag => SyntaxKind::PUSHTAG_KW,
Token::Poptag => SyntaxKind::POPTAG_KW,
Token::Pushmeta => SyntaxKind::PUSHMETA_KW,
Token::Popmeta => SyntaxKind::POPMETA_KW,
Token::Pending => SyntaxKind::PENDING_KW,
Token::LBrace => SyntaxKind::L_BRACE,
Token::RBrace => SyntaxKind::R_BRACE,
Token::LDoubleBrace => SyntaxKind::L_DOUBLE_BRACE,
Token::RDoubleBrace => SyntaxKind::R_DOUBLE_BRACE,
Token::LBraceHash => SyntaxKind::L_BRACE_HASH,
Token::LParen => SyntaxKind::L_PAREN,
Token::RParen => SyntaxKind::R_PAREN,
Token::At => SyntaxKind::AT,
Token::AtAt => SyntaxKind::AT_AT,
Token::Colon => SyntaxKind::COLON,
Token::Comma => SyntaxKind::COMMA,
Token::Tilde => SyntaxKind::TILDE,
Token::Pipe => SyntaxKind::PIPE,
Token::Plus => SyntaxKind::PLUS,
Token::Minus => SyntaxKind::MINUS,
Token::Star => SyntaxKind::STAR,
Token::Slash => SyntaxKind::SLASH,
Token::Hash => SyntaxKind::HASH,
Token::Whitespace(_) => SyntaxKind::WHITESPACE,
Token::Newline => SyntaxKind::NEWLINE,
Token::Comment(_) => SyntaxKind::COMMENT,
Token::PercentComment(_) => SyntaxKind::PERCENT_COMMENT,
Token::Shebang(_) => SyntaxKind::SHEBANG,
Token::EmacsDirective(_) => SyntaxKind::EMACS_DIRECTIVE,
Token::Indent(_) | Token::DeepIndent(_) => SyntaxKind::WHITESPACE,
Token::Error(_) => SyntaxKind::ERROR_TOKEN,
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `entries` tile `source`: the first range starts at byte 0,
    /// each range starts exactly where the previous one ended, no range is
    /// inverted, and the last range ends at `source.len()`.
    fn assert_tiles(source: &str, entries: &[(SyntaxKind, Range<usize>)]) {
        let mut covered = 0usize;
        for (index, entry) in entries.iter().enumerate() {
            let span = &entry.1;
            assert_eq!(
                span.start,
                covered,
                "entry {i} starts at {} but cursor is {cursor}",
                span.start,
                i = index,
                cursor = covered,
            );
            assert!(span.end >= span.start, "entry {i}: end < start", i = index);
            covered = span.end;
        }
        let total = source.len();
        assert_eq!(
            covered,
            total,
            "entries cover {cursor} of {} bytes",
            total,
            cursor = covered,
        );
    }

    /// An empty input yields no entries at all.
    #[test]
    fn empty_source() {
        let source = "";
        let tokens = lossless_kind_tokens(source);
        assert!(tokens.is_empty());
        assert_tiles(source, &tokens);
    }

    /// Input made only of spaces and tabs is covered entirely by WHITESPACE entries.
    #[test]
    fn whitespace_only() {
        let source = " \t ";
        let tokens = lossless_kind_tokens(source);
        assert_tiles(source, &tokens);
        let only_whitespace = tokens
            .iter()
            .map(|entry| &entry.0)
            .all(|kind| *kind == SyntaxKind::WHITESPACE);
        assert!(
            only_whitespace,
            "whitespace-only source should produce only WHITESPACE tokens",
        );
    }

    /// A leading BOM shows up as the first entry and the rest still tiles the source.
    #[test]
    fn bom_recovered_as_first_token() {
        let source = "\u{FEFF}2024-01-01 open Assets:Bank\n";
        let tokens = lossless_kind_tokens(source);
        let (first_kind, first_span) = &tokens[0];
        assert_eq!(*first_kind, SyntaxKind::BOM);
        assert_eq!(*first_span, 0..UTF8_BOM_LEN);
        assert_tiles(source, &tokens);
    }

    /// Concatenating the slices named by each range reproduces the input exactly.
    #[test]
    fn directive_tiles_source() {
        let source = "2024-01-01 open Assets:Bank USD\n2024-01-15 * \"Coffee\"\n Assets:Bank -5.00 USD\n Expenses:Food\n";
        let tokens = lossless_kind_tokens(source);
        assert_tiles(source, &tokens);
        let mut rebuilt = String::new();
        for (_, span) in &tokens {
            rebuilt.push_str(&source[span.start..span.end]);
        }
        assert_eq!(rebuilt, source);
    }
}