1use crate::common::keywords::key_symbols_sorted_by_len_desc;
9use crate::parse::TokenBlock;
10use crate::prelude::*;
11
12use std::collections::HashMap;
13use std::rc::Rc;
14
15pub struct Tokenizer {
16 pub macros_in_block_scopes: Vec<HashMap<String, Vec<String>>>,
17}
18
19impl Default for Tokenizer {
20 fn default() -> Self {
21 Self::new()
22 }
23}
24
25impl Tokenizer {
26 pub fn new() -> Self {
27 Self {
28 macros_in_block_scopes: vec![],
29 }
30 }
31
32 pub fn parse_blocks(
33 &mut self,
34 s: &str,
35 current_file_path: Rc<str>,
36 ) -> Result<Vec<TokenBlock>, RuntimeError> {
37 let stripped_source_code = self.strip_triple_quote_comment_blocks(s);
38 let lines: Vec<_> = stripped_source_code.lines().collect();
39 let mut i = 0;
40 self.parse_level(&lines, &mut i, 0, current_file_path)
41 }
42
43 pub fn tokenize_line(
44 &self,
45 line: &str,
46 line_file: LineFile,
47 ) -> Result<Vec<String>, RuntimeError> {
48 let raw_tokens = self.raw_tokenize_line(line);
49 self.expand_macro_tokens(raw_tokens, line_file)
50 }
51
52 fn raw_tokenize_line(&self, line: &str) -> Vec<String> {
53 let line = line.trim_end();
54 let symbols = key_symbols_sorted_by_len_desc();
55 let mut tokens = Vec::with_capacity(line.len());
56 let mut i = 0;
57 let bytes = line.as_bytes();
58
59 while i < bytes.len() {
60 if !line.is_char_boundary(i) {
61 let mut char_start = i;
62 while char_start > 0 && !line.is_char_boundary(char_start) {
63 char_start -= 1;
64 }
65 i = char_start;
66 continue;
67 }
68
69 if bytes[i] == b'#' {
70 break;
71 }
72
73 let ws_ch = line[i..].chars().next().unwrap_or('\0');
74 if ws_ch.is_whitespace() {
75 i += ws_ch.len_utf8();
76 continue;
77 }
78
79 let mut matched = false;
80 for &sym in &symbols {
81 let sym_length_bytes = sym.len();
82 if i + sym_length_bytes <= line.len()
83 && line.is_char_boundary(i)
84 && line.is_char_boundary(i + sym_length_bytes)
85 && &line[i..i + sym_length_bytes] == sym
86 {
87 tokens.push(sym.to_string());
88 i += sym_length_bytes;
89 matched = true;
90 break;
91 }
92 }
93 if matched {
94 continue;
95 }
96
97 if bytes[i].is_ascii_alphabetic() || bytes[i] == b'_' {
98 let start = i;
99 i += 1;
100 while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
101 i += 1;
102 }
103 tokens.push(line[start..i].to_string());
104 continue;
105 }
106
107 if bytes[i] == b'@' {
108 let start = i;
109 i += 1;
110 while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
111 i += 1;
112 }
113 tokens.push(line[start..i].to_string());
114 continue;
115 }
116
117 if bytes[i].is_ascii_digit() {
118 let start = i;
119 i += 1;
120 while i < bytes.len() && bytes[i].is_ascii_digit() {
121 i += 1;
122 }
123 if i + 1 < bytes.len() && bytes[i] == b'.' && bytes[i + 1].is_ascii_digit() {
124 i += 1;
125 while i < bytes.len() && bytes[i].is_ascii_digit() {
126 i += 1;
127 }
128 }
129 tokens.push(line[start..i].to_string());
130 continue;
131 }
132
133 let ch = line[i..].chars().next().unwrap_or('\0');
134 tokens.push(ch.to_string());
135 i += ch.len_utf8();
136 }
137 tokens
138 }
139
140 fn expand_macro_tokens(
141 &self,
142 raw_tokens: Vec<String>,
143 line_file: LineFile,
144 ) -> Result<Vec<String>, RuntimeError> {
145 let mut expanded = Vec::with_capacity(raw_tokens.len());
146 for token in raw_tokens {
147 if Self::is_macro_use_token(&token) {
148 let macro_name = &token[1..];
149 let macro_tokens = self.find_macro(macro_name).ok_or_else(|| {
150 Self::parse_error(format!("Unknown macro: {}", token), line_file.clone())
151 })?;
152 expanded.extend(macro_tokens.iter().cloned());
153 } else {
154 expanded.push(token);
155 }
156 }
157 Ok(expanded)
158 }
159
160 fn strip_triple_quote_comment_blocks(&self, source_code: &str) -> String {
161 let mut in_comment = false;
165 let line_count_upper_bound = source_code.lines().count();
166 let mut out_lines: Vec<String> = Vec::with_capacity(line_count_upper_bound);
167
168 for line in source_code.lines() {
169 let trimmed = line.trim();
170 let only_quote_chars = !trimmed.is_empty() && trimmed.chars().all(|c| c == '"');
171 if only_quote_chars {
172 in_comment = !in_comment;
173 out_lines.push(String::new());
174 continue;
175 }
176
177 if in_comment {
178 out_lines.push(String::new());
179 } else {
180 out_lines.push(line.to_string());
181 }
182 }
183
184 out_lines.join("\n")
185 }
186
187 fn parse_level(
188 &mut self,
189 lines: &[&str],
190 i: &mut usize,
191 base_indent: usize,
192 current_file_path: Rc<str>,
193 ) -> Result<Vec<TokenBlock>, RuntimeError> {
194 self.macros_in_block_scopes.push(HashMap::new());
195 let result = self.parse_level_in_current_scope(lines, i, base_indent, current_file_path);
196 self.macros_in_block_scopes.pop();
197 result
198 }
199
200 fn parse_level_in_current_scope(
201 &mut self,
202 lines: &[&str],
203 i: &mut usize,
204 base_indent: usize,
205 current_file_path: Rc<str>,
206 ) -> Result<Vec<TokenBlock>, RuntimeError> {
207 let remaining_line_count_upper_bound = lines.len().saturating_sub(*i);
208 let mut items = Vec::with_capacity(remaining_line_count_upper_bound);
209 let mut body_indent = None;
210
211 while *i < lines.len() {
212 let raw = lines[*i];
213 let line_no = *i + 1;
214 let line_file = (line_no, current_file_path.clone());
215 let indent = Self::indent_level(raw);
216 let content = raw.trim();
217
218 if content.is_empty() {
219 *i += 1;
220 continue;
221 }
222
223 if indent < base_indent {
224 break;
225 }
226
227 if indent > base_indent {
228 let trimmed_start = raw.trim_start();
232 if trimmed_start.is_empty() || trimmed_start.starts_with('#') {
233 *i += 1;
234 continue;
235 }
236 return Err(RuntimeError::from(ParseRuntimeError(
237 RuntimeErrorStruct::new_with_msg_and_line_file(
238 format!(
239 "unexpected indent at line {} in {}",
240 line_file.0,
241 line_file.1.as_ref()
242 ),
243 line_file,
244 ),
245 )));
246 }
247
248 *i += 1;
249
250 if self.register_macro_definition(content, line_file.clone())? {
253 continue;
254 }
255
256 let raw_header_tokens = self.raw_tokenize_line(content);
257 let header_tokens = self.expand_macro_tokens(raw_header_tokens, line_file.clone())?;
258 if header_tokens.is_empty() {
259 continue;
260 }
261
262 if Self::ends_with_colon(content) {
263 if *i >= lines.len() {
265 let line_file = (line_no, current_file_path.clone());
266 return Err(RuntimeError::from(ParseRuntimeError(
267 RuntimeErrorStruct::new_with_msg_and_line_file(
268 format!(
269 "block header missing body at line {} in {}",
270 line_file.0,
271 line_file.1.as_ref()
272 ),
273 line_file,
274 ),
275 )));
276 }
277
278 let next_indent = Self::indent_level(lines[*i]);
279 if next_indent <= indent {
280 let line_file = (*i + 1, current_file_path.clone());
281 return Err(RuntimeError::from(ParseRuntimeError(
282 RuntimeErrorStruct::new_with_msg_and_line_file(
283 format!(
284 "expected indent at line {} in {}",
285 line_file.0,
286 line_file.1.as_ref()
287 ),
288 line_file,
289 ),
290 )));
291 }
292
293 let body = self.parse_level(lines, i, next_indent, current_file_path.clone())?;
294 items.push(TokenBlock::new(
295 header_tokens,
296 body,
297 (line_no, current_file_path.clone()),
298 ));
299 } else {
300 items.push(TokenBlock::new(
301 header_tokens,
302 vec![],
303 (line_no, current_file_path.clone()),
304 ));
305 }
306
307 if let Some(expected) = body_indent {
308 if indent != expected {
309 let line_file = (line_no, current_file_path.clone());
310 return Err(RuntimeError::from(ParseRuntimeError(
311 RuntimeErrorStruct::new_with_msg_and_line_file(
312 format!(
313 "inconsistent indent at line {} in {}",
314 line_file.0,
315 line_file.1.as_ref()
316 ),
317 line_file,
318 ),
319 )));
320 }
321 } else {
322 body_indent = Some(indent);
323 }
324 }
325
326 Ok(items)
327 }
328
329 fn indent_level(line: &str) -> usize {
330 let mut n = 0;
331 for c in line.chars() {
332 match c {
333 ' ' => n += 1,
334 '\t' => n += 4,
335 _ => break,
336 }
337 }
338 n
339 }
340
341 fn ends_with_colon(s: &str) -> bool {
342 let trimmed = s.trim_end();
343 trimmed.ends_with(COLON)
344 }
345
346 fn register_macro_definition(
347 &mut self,
348 line: &str,
349 line_file: LineFile,
350 ) -> Result<bool, RuntimeError> {
351 let Some(mut rest) = line.strip_prefix("macro") else {
352 return Ok(false);
353 };
354 if !rest
355 .chars()
356 .next()
357 .map(|ch| ch.is_whitespace())
358 .unwrap_or(false)
359 {
360 return Ok(false);
361 }
362
363 if Self::ends_with_colon(line) {
364 return Err(Self::parse_error(
365 "macro definition cannot have a block body".to_string(),
366 line_file,
367 ));
368 }
369
370 rest = rest.trim_start();
371 let name_end = rest
372 .char_indices()
373 .find(|(_, ch)| ch.is_whitespace())
374 .map(|(index, _)| index)
375 .unwrap_or(rest.len());
376 let name = &rest[..name_end];
377 if !Self::is_macro_name(name) {
378 return Err(Self::parse_error(
379 format!("Invalid macro name: {}", name),
380 line_file,
381 ));
382 }
383
384 rest = rest[name_end..].trim_start();
385 let Some(replacement_and_tail) = rest.strip_prefix('"') else {
386 return Err(Self::parse_error(
387 "macro definition must be: macro name \"replacement\"".to_string(),
388 line_file,
389 ));
390 };
391
392 let mut replacement_end = None;
393 let mut escaped = false;
394 for (index, ch) in replacement_and_tail.char_indices() {
395 if escaped {
396 escaped = false;
397 continue;
398 }
399 if ch == '\\' {
400 escaped = true;
401 continue;
402 }
403 if ch == '"' {
404 replacement_end = Some(index);
405 break;
406 }
407 }
408
409 let Some(replacement_end) = replacement_end else {
410 return Err(Self::parse_error(
411 "macro replacement must end with a double quote".to_string(),
412 line_file,
413 ));
414 };
415 let replacement_source = &replacement_and_tail[..replacement_end];
416 let tail = replacement_and_tail[replacement_end + 1..].trim_start();
417 if !tail.is_empty() && !tail.starts_with('#') {
418 return Err(Self::parse_error(
419 "macro definition must be: macro name \"replacement\"".to_string(),
420 line_file,
421 ));
422 }
423
424 let replacement_tokens = self.raw_tokenize_line(replacement_source);
425 if replacement_tokens
426 .iter()
427 .any(|token| Self::is_macro_use_token(token) || token == "macro")
428 {
429 return Err(Self::parse_error(
430 "macro replacement cannot contain macro use or macro definition".to_string(),
431 line_file,
432 ));
433 }
434
435 let current_scope = self.macros_in_block_scopes.last_mut().ok_or_else(|| {
436 Self::parse_error(
437 "Internal tokenizer macro scope missing".to_string(),
438 line_file,
439 )
440 })?;
441 current_scope.insert(name.to_string(), replacement_tokens);
442 Ok(true)
443 }
444
445 fn find_macro(&self, name: &str) -> Option<&Vec<String>> {
446 self.macros_in_block_scopes
447 .iter()
448 .rev()
449 .find_map(|scope| scope.get(name))
450 }
451
452 fn is_macro_use_token(token: &str) -> bool {
453 let Some(name) = token.strip_prefix('@') else {
454 return false;
455 };
456 Self::is_macro_name(name)
457 }
458
459 fn is_macro_name(name: &str) -> bool {
460 let mut chars = name.chars();
461 match chars.next() {
462 Some(ch) if ch.is_ascii_alphabetic() || ch == '_' => {}
463 _ => return false,
464 }
465 chars.all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
466 }
467
468 fn parse_error(message: String, line_file: LineFile) -> RuntimeError {
469 RuntimeError::from(ParseRuntimeError(
470 RuntimeErrorStruct::new_with_msg_and_line_file(message, line_file),
471 ))
472 }
473}
474
475#[cfg(test)]
476mod tests {
477 use super::Tokenizer;
478 use crate::prelude::*;
479 use std::rc::Rc;
480
481 fn test_line_file() -> LineFile {
482 (1, Rc::from("test.lit"))
483 }
484
485 #[test]
486 fn exist_bang_splits_into_exist_and_bang() {
487 let tokenizer = Tokenizer::new();
488 assert_eq!(
489 tokenizer
490 .tokenize_line("exist! a R st {}", test_line_file())
491 .unwrap(),
492 vec!["exist", "!", "a", "R", "st", "{", "}"]
493 );
494 }
495
496 #[test]
497 fn forall_bang_splits_like_exist_bang() {
498 let tokenizer = Tokenizer::new();
499 assert_eq!(
500 tokenizer
501 .tokenize_line("forall! x R: x > 0", test_line_file())
502 .unwrap(),
503 vec!["forall", "!", "x", "R", ":", "x", ">", "0"]
504 );
505 }
506
507 #[test]
508 fn exist_bang_with_whitespace() {
509 let tokenizer = Tokenizer::new();
510 assert_eq!(
511 tokenizer
512 .tokenize_line("exist ! a R st {}", test_line_file())
513 .unwrap(),
514 vec!["exist", "!", "a", "R", "st", "{", "}"]
515 );
516 }
517
518 #[test]
519 fn macro_definition_expands_later_line_in_same_block() {
520 let mut tokenizer = Tokenizer::new();
521 let blocks = tokenizer
522 .parse_blocks("macro eq \"a = b\"\nhave @eq", Rc::from("test.lit"))
523 .unwrap();
524 assert_eq!(blocks.len(), 1);
525 assert_eq!(blocks[0].header, vec!["have", "a", "=", "b"]);
526 }
527
528 #[test]
529 fn macro_scope_is_visible_in_child_block() {
530 let mut tokenizer = Tokenizer::new();
531 let blocks = tokenizer
532 .parse_blocks(
533 "macro eq \"a = b\"\nprove claim:\n have @eq",
534 Rc::from("test.lit"),
535 )
536 .unwrap();
537 assert_eq!(blocks[0].body[0].header, vec!["have", "a", "=", "b"]);
538 }
539
540 #[test]
541 fn macro_scope_expires_after_block() {
542 let mut tokenizer = Tokenizer::new();
543 let error = tokenizer
544 .parse_blocks(
545 "prove claim:\n macro eq \"a = b\"\n have @eq\nhave @eq",
546 Rc::from("test.lit"),
547 )
548 .unwrap_err();
549 assert!(format!("{:?}", error).contains("Unknown macro: @eq"));
550 }
551
552 #[test]
553 fn inner_macro_overrides_outer_macro_only_inside_child_block() {
554 let mut tokenizer = Tokenizer::new();
555 let blocks = tokenizer
556 .parse_blocks(
557 "macro x \"outer\"\nprove claim:\n macro x \"inner\"\n have @x\nhave @x",
558 Rc::from("test.lit"),
559 )
560 .unwrap();
561 assert_eq!(blocks[0].body[0].header, vec!["have", "inner"]);
562 assert_eq!(blocks[1].header, vec!["have", "outer"]);
563 }
564
565 #[test]
566 fn macro_replacement_cannot_use_macro() {
567 let mut tokenizer = Tokenizer::new();
568 let error = tokenizer
569 .parse_blocks("macro x \"@other\"", Rc::from("test.lit"))
570 .unwrap_err();
571 assert!(format!("{:?}", error).contains("macro replacement cannot contain"));
572 }
573}