use super::core::Token;

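/// Conversion of a [`Token`] back into the literal text it was lexed from.
///
/// Purely structural tokens (`Indent`/`Dedent`) carry no literal text and
/// render as the empty string; [`detokenize`] reconstructs the indentation
/// they imply instead.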
pub trait ToLexString {
    fn to_lex_string(&self) -> String;
}

impl ToLexString for Token {
    fn to_lex_string(&self) -> String {
        match self {
            Token::LexMarker => "::".to_string(),
            Token::Indentation => " ".to_string(),
            Token::Whitespace(count) => " ".repeat(*count),
            Token::BlankLine(s) => s.as_deref().unwrap_or("\n").to_string(),
            Token::Dash => "-".to_string(),
            Token::Period => ".".to_string(),
            Token::OpenParen => "(".to_string(),
            Token::CloseParen => ")".to_string(),
            Token::Colon => ":".to_string(),
            Token::ExclamationMark => "!".to_string(),
            Token::QuestionMark => "?".to_string(),
            Token::Semicolon => ";".to_string(),
            Token::InvertedExclamationMark => "¡".to_string(),
            Token::InvertedQuestionMark => "¿".to_string(),
            Token::Ellipsis => "…".to_string(),
            Token::IdeographicFullStop => "。".to_string(),
            Token::FullwidthExclamationMark => "!".to_string(),
            Token::FullwidthQuestionMark => "?".to_string(),
            Token::ExclamationQuestionMark => "⁉".to_string(),
            Token::QuestionExclamationMark => "⁈".to_string(),
            Token::ArabicQuestionMark => "؟".to_string(),
            Token::ArabicFullStop => "۔".to_string(),
            Token::ArabicTripleDot => "؍".to_string(),
            Token::ArabicComma => "،".to_string(),
            Token::Danda => "।".to_string(),
            Token::DoubleDanda => "॥".to_string(),
            Token::BengaliCurrencyNumeratorFour => "৷".to_string(),
            Token::EthiopianFullStop => "።".to_string(),
            Token::ArmenianFullStop => "։".to_string(),
            Token::TibetanShad => "།".to_string(),
            Token::ThaiFongman => "๏".to_string(),
            Token::MyanmarComma => "၊".to_string(),
            Token::MyanmarFullStop => "။".to_string(),
            Token::Comma => ",".to_string(),
            Token::Quote => "\"".to_string(),
            Token::Equals => "=".to_string(),
            Token::Number(s) => s.clone(),
            Token::Text(s) => s.clone(),
            Token::Indent(_) | Token::Dedent(_) => String::new(),
        }
    }
}

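/// Reassembles a token stream into source text, replaying `Indent`/`Dedent`
/// as one leading space per level at the start of each line.
///
/// A minimal sketch of the expected behavior, assuming the variants above:
///
/// ```text
/// detokenize(&[Token::Text("Hi".into()), Token::BlankLine(None)]) == "Hi\n"
/// ```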
pub fn detokenize(tokens: &[Token]) -> String {
    let mut result = String::new();
    let mut indent_level = 0;

    for token in tokens {
        match token {
            // Structural tokens only adjust the current depth; they emit no
            // text themselves. Indent/Dedent are assumed to be balanced.
            Token::Indent(_) => indent_level += 1,
            Token::Dedent(_) => indent_level -= 1,
            Token::BlankLine(_) => {
                result.push_str(&token.to_lex_string());
            }
            _ => {
                // At the start of a line, materialize the current depth as
                // one space per indent level before the token's text.
                if result.ends_with('\n') || result.is_empty() {
                    for _ in 0..indent_level {
                        result.push_str(" ");
                    }
                }
                result.push_str(&token.to_lex_string());
            }
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::lex::lexing::{ensure_source_ends_with_newline, lex, tokenize};
    use crate::lex::testing::lexplore::specfile_finder::{self, DocumentType, ElementType};
    use std::fs;
    use std::path::{Path, PathBuf};

    #[test]
    fn to_lex_string_maps_every_literal_token() {
        let cases: Vec<(Token, &str)> = vec![
            (Token::LexMarker, "::"),
            (Token::Indentation, " "),
            (Token::Whitespace(1), " "),
            (Token::BlankLine(Some("\n".to_string())), "\n"),
            (Token::Dash, "-"),
            (Token::Period, "."),
            (Token::OpenParen, "("),
            (Token::CloseParen, ")"),
            (Token::Colon, ":"),
            (Token::ExclamationMark, "!"),
            (Token::QuestionMark, "?"),
            (Token::Semicolon, ";"),
            (Token::InvertedExclamationMark, "¡"),
            (Token::InvertedQuestionMark, "¿"),
            (Token::Ellipsis, "…"),
            (Token::IdeographicFullStop, "。"),
            (Token::FullwidthExclamationMark, "!"),
            (Token::FullwidthQuestionMark, "?"),
            (Token::ExclamationQuestionMark, "⁉"),
            (Token::QuestionExclamationMark, "⁈"),
            (Token::ArabicQuestionMark, "؟"),
            (Token::ArabicFullStop, "۔"),
            (Token::ArabicTripleDot, "؍"),
            (Token::ArabicComma, "،"),
            (Token::Danda, "।"),
            (Token::DoubleDanda, "॥"),
            (Token::BengaliCurrencyNumeratorFour, "৷"),
            (Token::EthiopianFullStop, "።"),
            (Token::ArmenianFullStop, "։"),
            (Token::TibetanShad, "།"),
            (Token::ThaiFongman, "๏"),
            (Token::MyanmarComma, "၊"),
            (Token::MyanmarFullStop, "။"),
            (Token::Comma, ","),
            (Token::Quote, "\""),
            (Token::Equals, "="),
            (Token::Number("42".to_string()), "42"),
            (Token::Text("Hello".to_string()), "Hello"),
        ];

        for (token, expected) in cases {
            assert_eq!(token.to_lex_string(), expected, "Token {token:?}");
        }
    }

    #[test]
    fn to_lex_string_handles_blank_line_fallback_and_semantic_tokens() {
        assert_eq!(Token::BlankLine(None).to_lex_string(), "\n");
        assert_eq!(Token::Indent(vec![]).to_lex_string(), "");
        assert_eq!(Token::Dedent(vec![]).to_lex_string(), "");
    }

    #[test]
    fn detokenize_applies_indentation_levels() {
        let tokens = vec![
            Token::Text("Session".to_string()),
            Token::BlankLine(Some("\n".to_string())),
            Token::Indent(vec![]),
            Token::Dash,
            Token::Whitespace(1),
            Token::Text("Item".to_string()),
            Token::Whitespace(1),
            Token::Number("1".to_string()),
            Token::BlankLine(Some("\n".to_string())),
            Token::Dedent(vec![]),
            Token::Text("After".to_string()),
            Token::BlankLine(Some("\n".to_string())),
        ];

        let expected = "Session\n - Item 1\nAfter\n";
        assert_eq!(detokenize(&tokens), expected);
    }

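    // Both suites below drive the same invariant through `assert_round_trip`:
    // tokenize a spec file, detokenize, and get the (canonicalized) text back.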
    #[test]
    fn round_trips_all_element_specs() {
        for path in collect_element_spec_files() {
            assert_round_trip(&path);
        }
    }

    #[test]
    fn round_trips_all_document_specs() {
        for doc in [DocumentType::Benchmark, DocumentType::Trifecta] {
            let category = doc.dir_name();
            for path in collect_files_by_number(category, None) {
                assert_round_trip(&path);
            }
        }
    }

    fn collect_element_spec_files() -> Vec<PathBuf> {
        let mut files = Vec::new();
        for element in [
            ElementType::Paragraph,
            ElementType::List,
            ElementType::Session,
            ElementType::Definition,
            ElementType::Annotation,
            ElementType::Verbatim,
        ] {
            let subcategory = element.dir_name();
            files.extend(collect_files_by_number("elements", Some(subcategory)));
        }
        files
    }

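    /// Resolves the spec files for a category via `specfile_finder` and
    /// returns them sorted by number, so test order is deterministic.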
    fn collect_files_by_number(category: &str, subcategory: Option<&str>) -> Vec<PathBuf> {
        let root = specfile_finder::get_doc_root(category, subcategory);
        let entries = specfile_finder::list_files_by_number(&root)
            .unwrap_or_else(|err| panic!("Failed to read {}: {}", root.display(), err));
        let mut items: Vec<_> = entries.into_iter().collect();
        items.sort_by_key(|(num, _)| *num);
        items.into_iter().map(|(_, path)| path).collect()
    }

    fn assert_round_trip(path: &Path) {
        let source = fs::read_to_string(path)
            .unwrap_or_else(|err| panic!("Failed to read {}: {}", path.display(), err));
        let source_with_newline = ensure_source_ends_with_newline(&source);
        let canonical_source = canonicalize_indentation(&source_with_newline);
        let semantic_expected = strip_blank_line_whitespace(&canonical_source);

        // Raw tokens must detokenize back to the canonicalized source verbatim.
        let raw_with_spans = tokenize(&source_with_newline);
        let raw_tokens: Vec<_> = raw_with_spans.iter().map(|(t, _)| t.clone()).collect();
        assert_eq!(
            detokenize(&raw_tokens),
            canonical_source,
            "Raw token round trip failed for {}",
            path.display()
        );

        // Semantic tokens are compared against the whitespace-stripped
        // expectation, since blank-line text is not preserved through `lex`.
        let semantic_tokens = lex(raw_with_spans).unwrap();
        let semantic_only: Vec<_> = semantic_tokens.iter().map(|(t, _)| t.clone()).collect();
        assert_eq!(
            detokenize(&semantic_only),
            semantic_expected,
            "Semantic token round trip failed for {}",
            path.display()
        );
    }

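    /// Replaces each tab with a single space, matching the one-space indent
    /// unit emitted by `detokenize`, so tab-indented sources compare equal.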
    fn canonicalize_indentation(source: &str) -> String {
        source.replace('\t', " ")
    }

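    /// Builds the semantic-round-trip expectation: every whitespace-only line
    /// collapses to a bare `'\n'`, mirroring the `BlankLine(None)` fallback in
    /// `to_lex_string`.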
    fn strip_blank_line_whitespace(source: &str) -> String {
        let mut normalized = String::with_capacity(source.len());
        for chunk in source.split_inclusive('\n') {
            if let Some(content) = chunk.strip_suffix('\n') {
                if content.trim().is_empty() {
                    normalized.push('\n');
                } else {
                    normalized.push_str(content);
                    normalized.push('\n');
                }
            } else {
                normalized.push_str(chunk);
            }
        }
        normalized
    }
}