//! Detokenization: converting token streams back into lex source text.

use super::core::Token;
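
/// Converts a token back into the literal text it represents in lex source.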
pub trait ToLexString {
    fn to_lex_string(&self) -> String;
}

impl ToLexString for Token {
    fn to_lex_string(&self) -> String {
        match self {
            Token::LexMarker => "::".to_string(),
            Token::Indentation => " ".to_string(),
            Token::Whitespace(count) => " ".repeat(*count),
            // A blank line may carry its original text; fall back to a bare newline.
            Token::BlankLine(s) => s.as_deref().unwrap_or("\n").to_string(),
            Token::Dash => "-".to_string(),
            Token::Period => ".".to_string(),
            Token::OpenParen => "(".to_string(),
            Token::CloseParen => ")".to_string(),
            Token::Colon => ":".to_string(),
            Token::ExclamationMark => "!".to_string(),
            Token::QuestionMark => "?".to_string(),
            Token::Semicolon => ";".to_string(),
            Token::InvertedExclamationMark => "¡".to_string(),
            Token::InvertedQuestionMark => "¿".to_string(),
            Token::Ellipsis => "…".to_string(),
            Token::IdeographicFullStop => "。".to_string(),
            Token::FullwidthExclamationMark => "!".to_string(),
            Token::FullwidthQuestionMark => "?".to_string(),
            Token::ExclamationQuestionMark => "⁉".to_string(),
            Token::QuestionExclamationMark => "⁈".to_string(),
            Token::ArabicQuestionMark => "؟".to_string(),
            Token::ArabicFullStop => "۔".to_string(),
            Token::ArabicTripleDot => "؍".to_string(),
            Token::ArabicComma => "،".to_string(),
            Token::Danda => "।".to_string(),
            Token::DoubleDanda => "॥".to_string(),
            Token::BengaliCurrencyNumeratorFour => "৷".to_string(),
            Token::EthiopianFullStop => "።".to_string(),
            Token::ArmenianFullStop => "։".to_string(),
            Token::TibetanShad => "།".to_string(),
            Token::ThaiFongman => "๏".to_string(),
            Token::MyanmarComma => "၊".to_string(),
            Token::MyanmarFullStop => "။".to_string(),
            Token::Comma => ",".to_string(),
            Token::Quote => "\"".to_string(),
            Token::Equals => "=".to_string(),
            Token::Number(s) => s.clone(),
            Token::Text(s) => s.clone(),
            // Indent/Dedent are structural markers with no literal text of their own.
            Token::Indent(_) | Token::Dedent(_) => String::new(),
        }
    }
}
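
/// Reconstructs source text from a token stream. Indent/Dedent tokens adjust
/// the current nesting level, and the corresponding indentation is re-emitted
/// at the start of each non-blank line.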
pub fn detokenize(tokens: &[Token]) -> String {
    let mut result = String::new();
    let mut indent_level = 0;
    for token in tokens {
        match token {
            Token::Indent(_) => indent_level += 1,
            Token::Dedent(_) => indent_level -= 1,
            // Blank lines are emitted verbatim, without indentation.
            Token::BlankLine(_) => {
                result.push_str(&token.to_lex_string());
            }
            _ => {
                // At the start of a line, re-apply the current indentation.
                if result.ends_with('\n') || result.is_empty() {
                    for _ in 0..indent_level {
                        result.push_str(" ");
                    }
                }
                result.push_str(&token.to_lex_string());
            }
        }
    }
    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::lex::lexing::{ensure_source_ends_with_newline, lex, tokenize};
    use crate::lex::testing::lexplore::specfile_finder::{self, DocumentType, ElementType};
    use std::fs;
    use std::path::{Path, PathBuf};

    #[test]
    fn to_lex_string_maps_every_literal_token() {
        let cases: Vec<(Token, &str)> = vec![
            (Token::LexMarker, "::"),
            (Token::Indentation, " "),
            (Token::Whitespace(1), " "),
            (Token::BlankLine(Some("\n".to_string())), "\n"),
            (Token::Dash, "-"),
            (Token::Period, "."),
            (Token::OpenParen, "("),
            (Token::CloseParen, ")"),
            (Token::Colon, ":"),
            (Token::ExclamationMark, "!"),
            (Token::QuestionMark, "?"),
            (Token::Semicolon, ";"),
            (Token::InvertedExclamationMark, "¡"),
            (Token::InvertedQuestionMark, "¿"),
            (Token::Ellipsis, "…"),
            (Token::IdeographicFullStop, "。"),
            (Token::FullwidthExclamationMark, "!"),
            (Token::FullwidthQuestionMark, "?"),
            (Token::ExclamationQuestionMark, "⁉"),
            (Token::QuestionExclamationMark, "⁈"),
            (Token::ArabicQuestionMark, "؟"),
            (Token::ArabicFullStop, "۔"),
            (Token::ArabicTripleDot, "؍"),
            (Token::ArabicComma, "،"),
            (Token::Danda, "।"),
            (Token::DoubleDanda, "॥"),
            (Token::BengaliCurrencyNumeratorFour, "৷"),
            (Token::EthiopianFullStop, "።"),
            (Token::ArmenianFullStop, "։"),
            (Token::TibetanShad, "།"),
            (Token::ThaiFongman, "๏"),
            (Token::MyanmarComma, "၊"),
            (Token::MyanmarFullStop, "။"),
            (Token::Comma, ","),
            (Token::Quote, "\""),
            (Token::Equals, "="),
            (Token::Number("42".to_string()), "42"),
            (Token::Text("Hello".to_string()), "Hello"),
        ];
        for (token, expected) in cases {
            assert_eq!(token.to_lex_string(), expected, "Token {token:?}");
        }
    }

    #[test]
    fn to_lex_string_handles_blank_line_fallback_and_semantic_tokens() {
        assert_eq!(Token::BlankLine(None).to_lex_string(), "\n");
        assert_eq!(Token::Indent(vec![]).to_lex_string(), "");
        assert_eq!(Token::Dedent(vec![]).to_lex_string(), "");
    }

    #[test]
    fn detokenize_applies_indentation_levels() {
        let tokens = vec![
            Token::Text("Session".to_string()),
            Token::BlankLine(Some("\n".to_string())),
            Token::Indent(vec![]),
            Token::Dash,
            Token::Whitespace(1),
            Token::Text("Item".to_string()),
            Token::Whitespace(1),
            Token::Number("1".to_string()),
            Token::BlankLine(Some("\n".to_string())),
            Token::Dedent(vec![]),
            Token::Text("After".to_string()),
            Token::BlankLine(Some("\n".to_string())),
        ];
        let expected = "Session\n - Item 1\nAfter\n";
        assert_eq!(detokenize(&tokens), expected);
    }

    #[test]
    fn round_trips_all_element_specs() {
        for path in collect_element_spec_files() {
            assert_round_trip(&path);
        }
    }

    #[test]
    fn round_trips_all_document_specs() {
        for doc in [DocumentType::Benchmark, DocumentType::Trifecta] {
            let category = doc.dir_name();
            for path in collect_files_by_number(category, None) {
                assert_round_trip(&path);
            }
        }
    }
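
    /// Gathers every element spec file across all element categories.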
    fn collect_element_spec_files() -> Vec<PathBuf> {
        let mut files = Vec::new();
        for element in [
            ElementType::Paragraph,
            ElementType::List,
            ElementType::Session,
            ElementType::Definition,
            ElementType::Annotation,
            ElementType::Verbatim,
        ] {
            let subcategory = element.dir_name();
            files.extend(collect_files_by_number("elements", Some(subcategory)));
        }
        files
    }
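
    /// Lists the spec files for a category (and optional subcategory),
    /// ordered by their numeric prefix.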
    fn collect_files_by_number(category: &str, subcategory: Option<&str>) -> Vec<PathBuf> {
        let root = specfile_finder::get_doc_root(category, subcategory);
        let entries = specfile_finder::list_files_by_number(&root)
            .unwrap_or_else(|err| panic!("Failed to read {}: {}", root.display(), err));
        let mut items: Vec<_> = entries.into_iter().collect();
        items.sort_by_key(|(num, _)| *num);
        items.into_iter().map(|(_, path)| path).collect()
    }
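
    /// Asserts that detokenizing both the raw and the semantic token streams
    /// reproduces the (canonicalized) source of a spec file.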
    fn assert_round_trip(path: &Path) {
        let source = fs::read_to_string(path)
            .unwrap_or_else(|err| panic!("Failed to read {}: {}", path.display(), err));
        let source_with_newline = ensure_source_ends_with_newline(&source);
        let canonical_source = canonicalize_indentation(&source_with_newline);
        let semantic_expected = strip_blank_line_whitespace(&canonical_source);
        // Raw tokens must reproduce the canonicalized source exactly.
        let raw_with_spans = tokenize(&source_with_newline);
        let raw_tokens: Vec<_> = raw_with_spans.iter().map(|(t, _)| t.clone()).collect();
        assert_eq!(
            detokenize(&raw_tokens),
            canonical_source,
            "Raw token round trip failed for {}",
            path.display()
        );
        // Semantic tokens do not preserve whitespace on blank lines, so compare
        // against the stripped form.
        let semantic_tokens = lex(raw_with_spans).unwrap();
        let semantic_only: Vec<_> = semantic_tokens.iter().map(|(t, _)| t.clone()).collect();
        assert_eq!(
            detokenize(&semantic_only),
            semantic_expected,
            "Semantic token round trip failed for {}",
            path.display()
        );
    }
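
    /// Replaces tabs with spaces, matching the detokenizer's space-based
    /// indentation output.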
    fn canonicalize_indentation(source: &str) -> String {
        source.replace('\t', " ")
    }
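
    /// Rewrites whitespace-only lines as bare newlines, mirroring how the
    /// semantic token stream represents blank lines.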
    fn strip_blank_line_whitespace(source: &str) -> String {
        let mut normalized = String::with_capacity(source.len());
        for chunk in source.split_inclusive('\n') {
            if let Some(content) = chunk.strip_suffix('\n') {
                if content.trim().is_empty() {
                    normalized.push('\n');
                } else {
                    normalized.push_str(content);
                    normalized.push('\n');
                }
            } else {
                // A final chunk without a trailing newline is kept as-is.
                normalized.push_str(chunk);
            }
        }
        normalized
    }
}