Skip to main content

litex/parse/
tokenizer.rs

1//! Tokenizer: splits source text into token blocks for the parser.
2//!
3//! - `#` starts an end-of-line comment.
4//! - Multi-character symbols from [`crate::common::keywords::key_symbols_sorted_by_len_desc`] are
5//!   matched with longest-first priority.
6//! - Double-quoted segments are one token (with `\"` and `\\` skips for the closing quote).
7
8use crate::common::keywords::key_symbols_sorted_by_len_desc;
9use crate::parse::TokenBlock;
10use crate::prelude::*;
11
12use std::collections::HashMap;
13use std::rc::Rc;
14
15pub struct Tokenizer {
16    pub macros_in_block_scopes: Vec<HashMap<String, Vec<String>>>,
17}
18
19impl Default for Tokenizer {
20    fn default() -> Self {
21        Self::new()
22    }
23}
24
25impl Tokenizer {
26    pub fn new() -> Self {
27        Self {
28            macros_in_block_scopes: vec![],
29        }
30    }
31
32    pub fn parse_blocks(
33        &mut self,
34        s: &str,
35        current_file_path: Rc<str>,
36    ) -> Result<Vec<TokenBlock>, RuntimeError> {
37        let stripped_source_code = self.strip_triple_quote_comment_blocks(s);
38        let lines: Vec<_> = stripped_source_code.lines().collect();
39        let mut i = 0;
40        self.parse_level(&lines, &mut i, 0, current_file_path)
41    }
42
43    pub fn tokenize_line(
44        &self,
45        line: &str,
46        line_file: LineFile,
47    ) -> Result<Vec<String>, RuntimeError> {
48        let raw_tokens = self.raw_tokenize_line(line);
49        self.expand_macro_tokens(raw_tokens, line_file)
50    }
51
52    fn raw_tokenize_line(&self, line: &str) -> Vec<String> {
53        let line = line.trim_end();
54        let symbols = key_symbols_sorted_by_len_desc();
55        let mut tokens = Vec::with_capacity(line.len());
56        let mut i = 0;
57        let bytes = line.as_bytes();
58
59        while i < bytes.len() {
60            if !line.is_char_boundary(i) {
61                let mut char_start = i;
62                while char_start > 0 && !line.is_char_boundary(char_start) {
63                    char_start -= 1;
64                }
65                i = char_start;
66                continue;
67            }
68
69            if bytes[i] == b'#' {
70                break;
71            }
72
73            let ws_ch = line[i..].chars().next().unwrap_or('\0');
74            if ws_ch.is_whitespace() {
75                i += ws_ch.len_utf8();
76                continue;
77            }
78
79            let mut matched = false;
80            for &sym in &symbols {
81                let sym_length_bytes = sym.len();
82                if i + sym_length_bytes <= line.len()
83                    && line.is_char_boundary(i)
84                    && line.is_char_boundary(i + sym_length_bytes)
85                    && &line[i..i + sym_length_bytes] == sym
86                {
87                    tokens.push(sym.to_string());
88                    i += sym_length_bytes;
89                    matched = true;
90                    break;
91                }
92            }
93            if matched {
94                continue;
95            }
96
97            if bytes[i].is_ascii_alphabetic() || bytes[i] == b'_' {
98                let start = i;
99                i += 1;
100                while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
101                    i += 1;
102                }
103                tokens.push(line[start..i].to_string());
104                continue;
105            }
106
107            if bytes[i] == b'@' {
108                let start = i;
109                i += 1;
110                while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
111                    i += 1;
112                }
113                tokens.push(line[start..i].to_string());
114                continue;
115            }
116
117            if bytes[i].is_ascii_digit() {
118                let start = i;
119                i += 1;
120                while i < bytes.len() && bytes[i].is_ascii_digit() {
121                    i += 1;
122                }
123                if i + 1 < bytes.len() && bytes[i] == b'.' && bytes[i + 1].is_ascii_digit() {
124                    i += 1;
125                    while i < bytes.len() && bytes[i].is_ascii_digit() {
126                        i += 1;
127                    }
128                }
129                tokens.push(line[start..i].to_string());
130                continue;
131            }
132
133            let ch = line[i..].chars().next().unwrap_or('\0');
134            tokens.push(ch.to_string());
135            i += ch.len_utf8();
136        }
137        tokens
138    }
139
140    fn expand_macro_tokens(
141        &self,
142        raw_tokens: Vec<String>,
143        line_file: LineFile,
144    ) -> Result<Vec<String>, RuntimeError> {
145        let mut expanded = Vec::with_capacity(raw_tokens.len());
146        for token in raw_tokens {
147            if Self::is_macro_use_token(&token) {
148                let macro_name = &token[1..];
149                let macro_tokens = self.find_macro(macro_name).ok_or_else(|| {
150                    Self::parse_error(format!("Unknown macro: {}", token), line_file.clone())
151                })?;
152                expanded.extend(macro_tokens.iter().cloned());
153            } else {
154                expanded.push(token);
155            }
156        }
157        Ok(expanded)
158    }
159
160    fn strip_triple_quote_comment_blocks(&self, source_code: &str) -> String {
161        // Treat a line that consists only of `"` characters (after trimming) as a delimiter.
162        // Between two delimiter lines, everything is replaced with empty lines so
163        // the parser will ignore those lines.
164        let mut in_comment = false;
165        let line_count_upper_bound = source_code.lines().count();
166        let mut out_lines: Vec<String> = Vec::with_capacity(line_count_upper_bound);
167
168        for line in source_code.lines() {
169            let trimmed = line.trim();
170            let only_quote_chars = !trimmed.is_empty() && trimmed.chars().all(|c| c == '"');
171            if only_quote_chars {
172                in_comment = !in_comment;
173                out_lines.push(String::new());
174                continue;
175            }
176
177            if in_comment {
178                out_lines.push(String::new());
179            } else {
180                out_lines.push(line.to_string());
181            }
182        }
183
184        out_lines.join("\n")
185    }
186
187    fn parse_level(
188        &mut self,
189        lines: &[&str],
190        i: &mut usize,
191        base_indent: usize,
192        current_file_path: Rc<str>,
193    ) -> Result<Vec<TokenBlock>, RuntimeError> {
194        self.macros_in_block_scopes.push(HashMap::new());
195        let result = self.parse_level_in_current_scope(lines, i, base_indent, current_file_path);
196        self.macros_in_block_scopes.pop();
197        result
198    }
199
200    fn parse_level_in_current_scope(
201        &mut self,
202        lines: &[&str],
203        i: &mut usize,
204        base_indent: usize,
205        current_file_path: Rc<str>,
206    ) -> Result<Vec<TokenBlock>, RuntimeError> {
207        let remaining_line_count_upper_bound = lines.len().saturating_sub(*i);
208        let mut items = Vec::with_capacity(remaining_line_count_upper_bound);
209        let mut body_indent = None;
210
211        while *i < lines.len() {
212            let raw = lines[*i];
213            let line_no = *i + 1;
214            let line_file = (line_no, current_file_path.clone());
215            let indent = Self::indent_level(raw);
216            let content = raw.trim();
217
218            if content.is_empty() {
219                *i += 1;
220                continue;
221            }
222
223            if indent < base_indent {
224                break;
225            }
226
227            if indent > base_indent {
228                // Indented farther than the current block: normally a syntax error, but allow lines that
229                // are only a `# ...` comment (any leading spaces/tabs before `#`). Writers often align
230                // such comments with a surrounding block without attaching them to a sibling item.
231                let trimmed_start = raw.trim_start();
232                if trimmed_start.is_empty() || trimmed_start.starts_with('#') {
233                    *i += 1;
234                    continue;
235                }
236                return Err(RuntimeError::from(ParseRuntimeError(
237                    RuntimeErrorStruct::new_with_msg_and_line_file(
238                        format!(
239                            "unexpected indent at line {} in {}",
240                            line_file.0,
241                            line_file.1.as_ref()
242                        ),
243                        line_file,
244                    ),
245                )));
246            }
247
248            *i += 1;
249
250            // Tokenize header; if it's empty (e.g. whole line comment),
251            // treat it like a blank line for block parsing.
252            if self.register_macro_definition(content, line_file.clone())? {
253                continue;
254            }
255
256            let raw_header_tokens = self.raw_tokenize_line(content);
257            let header_tokens = self.expand_macro_tokens(raw_header_tokens, line_file.clone())?;
258            if header_tokens.is_empty() {
259                continue;
260            }
261
262            if Self::ends_with_colon(content) {
263                // 必须有 body
264                if *i >= lines.len() {
265                    let line_file = (line_no, current_file_path.clone());
266                    return Err(RuntimeError::from(ParseRuntimeError(
267                        RuntimeErrorStruct::new_with_msg_and_line_file(
268                            format!(
269                                "block header missing body at line {} in {}",
270                                line_file.0,
271                                line_file.1.as_ref()
272                            ),
273                            line_file,
274                        ),
275                    )));
276                }
277
278                let next_indent = Self::indent_level(lines[*i]);
279                if next_indent <= indent {
280                    let line_file = (*i + 1, current_file_path.clone());
281                    return Err(RuntimeError::from(ParseRuntimeError(
282                        RuntimeErrorStruct::new_with_msg_and_line_file(
283                            format!(
284                                "expected indent at line {} in {}",
285                                line_file.0,
286                                line_file.1.as_ref()
287                            ),
288                            line_file,
289                        ),
290                    )));
291                }
292
293                let body = self.parse_level(lines, i, next_indent, current_file_path.clone())?;
294                items.push(TokenBlock::new(
295                    header_tokens,
296                    body,
297                    (line_no, current_file_path.clone()),
298                ));
299            } else {
300                items.push(TokenBlock::new(
301                    header_tokens,
302                    vec![],
303                    (line_no, current_file_path.clone()),
304                ));
305            }
306
307            if let Some(expected) = body_indent {
308                if indent != expected {
309                    let line_file = (line_no, current_file_path.clone());
310                    return Err(RuntimeError::from(ParseRuntimeError(
311                        RuntimeErrorStruct::new_with_msg_and_line_file(
312                            format!(
313                                "inconsistent indent at line {} in {}",
314                                line_file.0,
315                                line_file.1.as_ref()
316                            ),
317                            line_file,
318                        ),
319                    )));
320                }
321            } else {
322                body_indent = Some(indent);
323            }
324        }
325
326        Ok(items)
327    }
328
329    fn indent_level(line: &str) -> usize {
330        let mut n = 0;
331        for c in line.chars() {
332            match c {
333                ' ' => n += 1,
334                '\t' => n += 4,
335                _ => break,
336            }
337        }
338        n
339    }
340
341    fn ends_with_colon(s: &str) -> bool {
342        let trimmed = s.trim_end();
343        trimmed.ends_with(COLON)
344    }
345
346    fn register_macro_definition(
347        &mut self,
348        line: &str,
349        line_file: LineFile,
350    ) -> Result<bool, RuntimeError> {
351        let Some(mut rest) = line.strip_prefix("macro") else {
352            return Ok(false);
353        };
354        if !rest
355            .chars()
356            .next()
357            .map(|ch| ch.is_whitespace())
358            .unwrap_or(false)
359        {
360            return Ok(false);
361        }
362
363        if Self::ends_with_colon(line) {
364            return Err(Self::parse_error(
365                "macro definition cannot have a block body".to_string(),
366                line_file,
367            ));
368        }
369
370        rest = rest.trim_start();
371        let name_end = rest
372            .char_indices()
373            .find(|(_, ch)| ch.is_whitespace())
374            .map(|(index, _)| index)
375            .unwrap_or(rest.len());
376        let name = &rest[..name_end];
377        if !Self::is_macro_name(name) {
378            return Err(Self::parse_error(
379                format!("Invalid macro name: {}", name),
380                line_file,
381            ));
382        }
383
384        rest = rest[name_end..].trim_start();
385        let Some(replacement_and_tail) = rest.strip_prefix('"') else {
386            return Err(Self::parse_error(
387                "macro definition must be: macro name \"replacement\"".to_string(),
388                line_file,
389            ));
390        };
391
392        let mut replacement_end = None;
393        let mut escaped = false;
394        for (index, ch) in replacement_and_tail.char_indices() {
395            if escaped {
396                escaped = false;
397                continue;
398            }
399            if ch == '\\' {
400                escaped = true;
401                continue;
402            }
403            if ch == '"' {
404                replacement_end = Some(index);
405                break;
406            }
407        }
408
409        let Some(replacement_end) = replacement_end else {
410            return Err(Self::parse_error(
411                "macro replacement must end with a double quote".to_string(),
412                line_file,
413            ));
414        };
415        let replacement_source = &replacement_and_tail[..replacement_end];
416        let tail = replacement_and_tail[replacement_end + 1..].trim_start();
417        if !tail.is_empty() && !tail.starts_with('#') {
418            return Err(Self::parse_error(
419                "macro definition must be: macro name \"replacement\"".to_string(),
420                line_file,
421            ));
422        }
423
424        let replacement_tokens = self.raw_tokenize_line(replacement_source);
425        if replacement_tokens
426            .iter()
427            .any(|token| Self::is_macro_use_token(token) || token == "macro")
428        {
429            return Err(Self::parse_error(
430                "macro replacement cannot contain macro use or macro definition".to_string(),
431                line_file,
432            ));
433        }
434
435        let current_scope = self.macros_in_block_scopes.last_mut().ok_or_else(|| {
436            Self::parse_error(
437                "Internal tokenizer macro scope missing".to_string(),
438                line_file,
439            )
440        })?;
441        current_scope.insert(name.to_string(), replacement_tokens);
442        Ok(true)
443    }
444
445    fn find_macro(&self, name: &str) -> Option<&Vec<String>> {
446        self.macros_in_block_scopes
447            .iter()
448            .rev()
449            .find_map(|scope| scope.get(name))
450    }
451
452    fn is_macro_use_token(token: &str) -> bool {
453        let Some(name) = token.strip_prefix('@') else {
454            return false;
455        };
456        Self::is_macro_name(name)
457    }
458
459    fn is_macro_name(name: &str) -> bool {
460        let mut chars = name.chars();
461        match chars.next() {
462            Some(ch) if ch.is_ascii_alphabetic() || ch == '_' => {}
463            _ => return false,
464        }
465        chars.all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
466    }
467
468    fn parse_error(message: String, line_file: LineFile) -> RuntimeError {
469        RuntimeError::from(ParseRuntimeError(
470            RuntimeErrorStruct::new_with_msg_and_line_file(message, line_file),
471        ))
472    }
473}
474
475#[cfg(test)]
476mod tests {
477    use super::Tokenizer;
478    use crate::prelude::*;
479    use std::rc::Rc;
480
481    fn test_line_file() -> LineFile {
482        (1, Rc::from("test.lit"))
483    }
484
485    #[test]
486    fn exist_bang_splits_into_exist_and_bang() {
487        let tokenizer = Tokenizer::new();
488        assert_eq!(
489            tokenizer
490                .tokenize_line("exist! a R st {}", test_line_file())
491                .unwrap(),
492            vec!["exist", "!", "a", "R", "st", "{", "}"]
493        );
494    }
495
496    #[test]
497    fn forall_bang_splits_like_exist_bang() {
498        let tokenizer = Tokenizer::new();
499        assert_eq!(
500            tokenizer
501                .tokenize_line("forall! x R: x > 0", test_line_file())
502                .unwrap(),
503            vec!["forall", "!", "x", "R", ":", "x", ">", "0"]
504        );
505    }
506
507    #[test]
508    fn exist_bang_with_whitespace() {
509        let tokenizer = Tokenizer::new();
510        assert_eq!(
511            tokenizer
512                .tokenize_line("exist ! a R st {}", test_line_file())
513                .unwrap(),
514            vec!["exist", "!", "a", "R", "st", "{", "}"]
515        );
516    }
517
518    #[test]
519    fn macro_definition_expands_later_line_in_same_block() {
520        let mut tokenizer = Tokenizer::new();
521        let blocks = tokenizer
522            .parse_blocks("macro eq \"a = b\"\nhave @eq", Rc::from("test.lit"))
523            .unwrap();
524        assert_eq!(blocks.len(), 1);
525        assert_eq!(blocks[0].header, vec!["have", "a", "=", "b"]);
526    }
527
528    #[test]
529    fn macro_scope_is_visible_in_child_block() {
530        let mut tokenizer = Tokenizer::new();
531        let blocks = tokenizer
532            .parse_blocks(
533                "macro eq \"a = b\"\nprove claim:\n    have @eq",
534                Rc::from("test.lit"),
535            )
536            .unwrap();
537        assert_eq!(blocks[0].body[0].header, vec!["have", "a", "=", "b"]);
538    }
539
540    #[test]
541    fn macro_scope_expires_after_block() {
542        let mut tokenizer = Tokenizer::new();
543        let error = tokenizer
544            .parse_blocks(
545                "prove claim:\n    macro eq \"a = b\"\n    have @eq\nhave @eq",
546                Rc::from("test.lit"),
547            )
548            .unwrap_err();
549        assert!(format!("{:?}", error).contains("Unknown macro: @eq"));
550    }
551
552    #[test]
553    fn inner_macro_overrides_outer_macro_only_inside_child_block() {
554        let mut tokenizer = Tokenizer::new();
555        let blocks = tokenizer
556            .parse_blocks(
557                "macro x \"outer\"\nprove claim:\n    macro x \"inner\"\n    have @x\nhave @x",
558                Rc::from("test.lit"),
559            )
560            .unwrap();
561        assert_eq!(blocks[0].body[0].header, vec!["have", "inner"]);
562        assert_eq!(blocks[1].header, vec!["have", "outer"]);
563    }
564
565    #[test]
566    fn macro_replacement_cannot_use_macro() {
567        let mut tokenizer = Tokenizer::new();
568        let error = tokenizer
569            .parse_blocks("macro x \"@other\"", Rc::from("test.lit"))
570            .unwrap_err();
571        assert!(format!("{:?}", error).contains("macro replacement cannot contain"));
572    }
573}