// lex_core/lex/token/formatting.rs
1//! Detokenizer for the lex format
2//!
3//! This module provides functionality to convert a stream of tokens back into a string.
4//!
5//! Unlike other formatters in this module which work on AST `Document` objects,
6//! the detokenizer works at the token level, converting token streams back to
7//! source text. This is useful for:
8//!
9//! - Round-trip testing (source -> tokens -> source)
10//! - Token-level transformations that preserve the original format
11//! - Debugging and visualization of token streams
12//!
13//! The detokenizer handles:
14//! - Raw tokens (basic token -> string conversion)
15//! - Semantic indentation tokens (Indent/Dedent) for proper formatting
16
17use super::core::Token;
18
/// Trait for converting a token to its string representation
pub trait ToLexString {
    /// Render this token as the literal source text it stands for.
    ///
    /// Content-bearing tokens (`Text`, `Number`, `Whitespace`, `BlankLine`)
    /// reproduce their payload; fixed punctuation tokens map to their single
    /// literal; synthetic tokens (`Indent`/`Dedent`) render as the empty
    /// string since they carry no source text.
    fn to_lex_string(&self) -> String;
}
23
24impl ToLexString for Token {
25    fn to_lex_string(&self) -> String {
26        match self {
27            Token::LexMarker => "::".to_string(),
28            Token::Indentation => "    ".to_string(),
29            Token::Whitespace(count) => " ".repeat(*count),
30            // BlankLine should always contain the newline character(s) for round-trip fidelity.
31            // The logos regex always produces Some(...), but we default to "\n" for safety.
32            Token::BlankLine(s) => s.as_deref().unwrap_or("\n").to_string(),
33            Token::Dash => "-".to_string(),
34            Token::Period => ".".to_string(),
35            Token::OpenParen => "(".to_string(),
36            Token::CloseParen => ")".to_string(),
37            Token::Colon => ":".to_string(),
38            Token::ExclamationMark => "!".to_string(),
39            Token::QuestionMark => "?".to_string(),
40            Token::Semicolon => ";".to_string(),
41            Token::InvertedExclamationMark => "¡".to_string(),
42            Token::InvertedQuestionMark => "¿".to_string(),
43            Token::Ellipsis => "…".to_string(),
44            Token::IdeographicFullStop => "。".to_string(),
45            Token::FullwidthExclamationMark => "!".to_string(),
46            Token::FullwidthQuestionMark => "?".to_string(),
47            Token::ExclamationQuestionMark => "⁉".to_string(),
48            Token::QuestionExclamationMark => "⁈".to_string(),
49            Token::ArabicQuestionMark => "؟".to_string(),
50            Token::ArabicFullStop => "۔".to_string(),
51            Token::ArabicTripleDot => "؍".to_string(),
52            Token::ArabicComma => "،".to_string(),
53            Token::Danda => "।".to_string(),
54            Token::DoubleDanda => "॥".to_string(),
55            Token::BengaliCurrencyNumeratorFour => "৷".to_string(),
56            Token::EthiopianFullStop => "።".to_string(),
57            Token::ArmenianFullStop => "։".to_string(),
58            Token::TibetanShad => "།".to_string(),
59            Token::ThaiFongman => "๏".to_string(),
60            Token::MyanmarComma => "၊".to_string(),
61            Token::MyanmarFullStop => "။".to_string(),
62            Token::Comma => ",".to_string(),
63            Token::Quote => "\"".to_string(),
64            Token::Equals => "=".to_string(),
65            Token::Number(s) => s.clone(),
66            Token::Text(s) => s.clone(),
67            // The following tokens are synthetic and should not be part of the detokenized output
68            Token::Indent(_) | Token::Dedent(_) => String::new(),
69        }
70    }
71}
72
73/// Detokenize a stream of tokens into a string
74///
75/// This function converts a sequence of tokens back to source text,
76/// handling semantic indentation (Indent/Dedent tokens) to reconstruct
77/// the proper indentation structure.
78///
79/// # Arguments
80///
81/// * `tokens` - Slice of tokens to detokenize
82///
83/// # Returns
84///
85/// A string representation of the tokens with proper indentation
86///
87/// # Examples
88///
89/// ```ignore
90/// use lex::lex::formats::detokenizer::detokenize;
91/// use lex::lex::lexing::tokenize;
92///
93/// let source = "Hello world";
94/// let tokens: Vec<_> = tokenize(source).into_iter().map(|(t, _)| t).collect();
95/// let result = detokenize(&tokens);
96/// assert_eq!(result, source);
97/// ```
98pub fn detokenize(tokens: &[Token]) -> String {
99    let mut result = String::new();
100    let mut indent_level = 0;
101
102    for token in tokens {
103        match token {
104            Token::Indent(_) => indent_level += 1,
105            Token::Dedent(_) => indent_level -= 1,
106            Token::BlankLine(_) => {
107                result.push_str(&token.to_lex_string());
108            }
109            _ => {
110                if result.ends_with('\n') || result.is_empty() {
111                    for _ in 0..indent_level {
112                        result.push_str("    ");
113                    }
114                }
115                result.push_str(&token.to_lex_string());
116            }
117        }
118    }
119
120    result
121}
122
123#[cfg(test)]
124mod tests {
125    use super::*;
126    use crate::lex::lexing::{ensure_source_ends_with_newline, lex, tokenize};
127    use crate::lex::testing::lexplore::specfile_finder::{self, DocumentType, ElementType};
128    use std::fs;
129    use std::path::{Path, PathBuf};
130
131    #[test]
132    fn to_lex_string_maps_every_literal_token() {
133        let cases: Vec<(Token, &str)> = vec![
134            (Token::LexMarker, "::"),
135            (Token::Indentation, "    "),
136            (Token::Whitespace(1), " "),
137            (Token::BlankLine(Some("\n".to_string())), "\n"),
138            (Token::Dash, "-"),
139            (Token::Period, "."),
140            (Token::OpenParen, "("),
141            (Token::CloseParen, ")"),
142            (Token::Colon, ":"),
143            (Token::ExclamationMark, "!"),
144            (Token::QuestionMark, "?"),
145            (Token::Semicolon, ";"),
146            (Token::InvertedExclamationMark, "¡"),
147            (Token::InvertedQuestionMark, "¿"),
148            (Token::Ellipsis, "…"),
149            (Token::IdeographicFullStop, "。"),
150            (Token::FullwidthExclamationMark, "!"),
151            (Token::FullwidthQuestionMark, "?"),
152            (Token::ExclamationQuestionMark, "⁉"),
153            (Token::QuestionExclamationMark, "⁈"),
154            (Token::ArabicQuestionMark, "؟"),
155            (Token::ArabicFullStop, "۔"),
156            (Token::ArabicTripleDot, "؍"),
157            (Token::ArabicComma, "،"),
158            (Token::Danda, "।"),
159            (Token::DoubleDanda, "॥"),
160            (Token::BengaliCurrencyNumeratorFour, "৷"),
161            (Token::EthiopianFullStop, "።"),
162            (Token::ArmenianFullStop, "։"),
163            (Token::TibetanShad, "།"),
164            (Token::ThaiFongman, "๏"),
165            (Token::MyanmarComma, "၊"),
166            (Token::MyanmarFullStop, "။"),
167            (Token::Comma, ","),
168            (Token::Quote, "\""),
169            (Token::Equals, "="),
170            (Token::Number("42".to_string()), "42"),
171            (Token::Text("Hello".to_string()), "Hello"),
172        ];
173
174        for (token, expected) in cases {
175            assert_eq!(token.to_lex_string(), expected, "Token {token:?}");
176        }
177    }
178
179    #[test]
180    fn to_lex_string_handles_blank_line_fallback_and_semantic_tokens() {
181        assert_eq!(Token::BlankLine(None).to_lex_string(), "\n");
182        assert_eq!(Token::Indent(vec![]).to_lex_string(), "");
183        assert_eq!(Token::Dedent(vec![]).to_lex_string(), "");
184    }
185
186    #[test]
187    fn detokenize_applies_indentation_levels() {
188        let tokens = vec![
189            Token::Text("Session".to_string()),
190            Token::BlankLine(Some("\n".to_string())),
191            Token::Indent(vec![]),
192            Token::Dash,
193            Token::Whitespace(1),
194            Token::Text("Item".to_string()),
195            Token::Whitespace(1),
196            Token::Number("1".to_string()),
197            Token::BlankLine(Some("\n".to_string())),
198            Token::Dedent(vec![]),
199            Token::Text("After".to_string()),
200            Token::BlankLine(Some("\n".to_string())),
201        ];
202
203        let expected = "Session\n    - Item 1\nAfter\n";
204        assert_eq!(detokenize(&tokens), expected);
205    }
206
207    #[test]
208    fn round_trips_all_element_specs() {
209        for path in collect_element_spec_files() {
210            assert_round_trip(&path);
211        }
212    }
213
214    #[test]
215    fn round_trips_all_document_specs() {
216        for doc in [DocumentType::Benchmark, DocumentType::Trifecta] {
217            let category = doc.dir_name();
218            for path in collect_files_by_number(category, None) {
219                assert_round_trip(&path);
220            }
221        }
222    }
223
224    fn collect_element_spec_files() -> Vec<PathBuf> {
225        let mut files = Vec::new();
226        for element in [
227            ElementType::Paragraph,
228            ElementType::List,
229            ElementType::Session,
230            ElementType::Definition,
231            ElementType::Annotation,
232            ElementType::Verbatim,
233        ] {
234            let subcategory = element.dir_name();
235            files.extend(collect_files_by_number("elements", Some(subcategory)));
236        }
237        files
238    }
239
240    fn collect_files_by_number(category: &str, subcategory: Option<&str>) -> Vec<PathBuf> {
241        let root = specfile_finder::get_doc_root(category, subcategory);
242        let entries = specfile_finder::list_files_by_number(&root)
243            .unwrap_or_else(|err| panic!("Failed to read {}: {}", root.display(), err));
244        let mut items: Vec<_> = entries.into_iter().collect();
245        items.sort_by_key(|(num, _)| *num);
246        items.into_iter().map(|(_, path)| path).collect()
247    }
248
249    fn assert_round_trip(path: &Path) {
250        let source = fs::read_to_string(path)
251            .unwrap_or_else(|err| panic!("Failed to read {}: {}", path.display(), err));
252        let source_with_newline = ensure_source_ends_with_newline(&source);
253        let canonical_source = canonicalize_indentation(&source_with_newline);
254        let semantic_expected = strip_blank_line_whitespace(&canonical_source);
255        let raw_with_spans = tokenize(&source_with_newline);
256        let raw_tokens: Vec<_> = raw_with_spans.iter().map(|(t, _)| t.clone()).collect();
257        assert_eq!(
258            detokenize(&raw_tokens),
259            canonical_source,
260            "Raw token round trip failed for {}",
261            path.display()
262        );
263
264        let semantic_tokens = lex(raw_with_spans).unwrap();
265        let semantic_only: Vec<_> = semantic_tokens.iter().map(|(t, _)| t.clone()).collect();
266        assert_eq!(
267            detokenize(&semantic_only),
268            semantic_expected,
269            "Semantic token round trip failed for {}",
270            path.display()
271        );
272    }
273
274    fn canonicalize_indentation(source: &str) -> String {
275        source.replace('\t', "    ")
276    }
277
278    fn strip_blank_line_whitespace(source: &str) -> String {
279        let mut normalized = String::with_capacity(source.len());
280        for chunk in source.split_inclusive('\n') {
281            if let Some(content) = chunk.strip_suffix('\n') {
282                if content.trim().is_empty() {
283                    normalized.push('\n');
284                } else {
285                    normalized.push_str(content);
286                    normalized.push('\n');
287                }
288            } else {
289                normalized.push_str(chunk);
290            }
291        }
292        normalized
293    }
294}