Skip to main content

miden_assembly_syntax/parser/
mod.rs

/// Shorthand used by the grammar definition to build source spans.
///
/// * `span!(id, pos)` delegates to `SourceSpan::at(id, pos)`.
/// * `span!(id, start, end)` delegates to `SourceSpan::new(id, start..end)`.
macro_rules! span {
    ($source:expr, $pos:expr) => {
        ::miden_debug_types::SourceSpan::at($source, $pos)
    };
    ($source:expr, $start:expr, $end:expr) => {
        ::miden_debug_types::SourceSpan::new($source, $start..$end)
    };
}
10
// Declares the private `grammar` module from the LALRPOP-generated parser source. The
// `#[expect(...)]` attributes are applied to that module: each named lint is expected to fire
// in the generated code, and is silenced there rather than crate-wide.
lalrpop_util::lalrpop_mod!(
    #[expect(clippy::all)]
    #[expect(clippy::redundant_closure_for_method_calls)]
    #[expect(clippy::trivially_copy_pass_by_ref)]
    #[expect(unused_lifetimes)]
    #[expect(unused_qualifications)]
    grammar,
    "/parser/grammar.rs"
);
20
21mod error;
22mod lexer;
23mod scanner;
24mod token;
25
26use alloc::{boxed::Box, collections::BTreeSet, string::ToString, sync::Arc, vec::Vec};
27
28use miden_debug_types::{SourceFile, SourceLanguage, SourceManager, Uri};
29use miden_utils_diagnostics::Report;
30
31pub use self::{
32    error::{BinErrorKind, HexErrorKind, LiteralErrorKind, ParsingError},
33    lexer::Lexer,
34    scanner::Scanner,
35    token::{BinEncodedValue, DocumentationType, IntValue, PushValue, Token, WordValue},
36};
37use crate::{Path, ast, sema};
38
39// TYPE ALIASES
40// ================================================================================================
41
/// The LALRPOP parse error specialized for this parser: locations are `u32` offsets, tokens are
/// [Token]s, and user-defined errors are [ParsingError]s.
type ParseError<'a> = lalrpop_util::ParseError<u32, Token<'a>, ParsingError>;
43
44// MODULE PARSER
45// ================================================================================================
46
/// This is a wrapper around the lower-level parser infrastructure which handles orchestrating all
/// of the pieces needed to parse an [ast::Module] from source, and run semantic analysis on it.
///
/// A single parser may be reused for several modules; the set of interned strings is kept across
/// calls to [ModuleParser::parse].
#[derive(Default)]
pub struct ModuleParser {
    /// The kind of module we're parsing.
    ///
    /// This is used when performing semantic analysis to detect when various invalid constructions
    /// are encountered, such as use of the `syscall` instruction in a kernel module.
    kind: ast::ModuleKind,
    /// A set of interned strings allocated during parsing/semantic analysis.
    ///
    /// This is a very primitive and imprecise way of interning strings, but was the least invasive
    /// at the time the new parser was implemented. In essence, we avoid duplicating allocations
    /// for frequently occurring strings, by tracking which strings we've seen before, and
    /// sharing a reference-counted pointer instead.
    ///
    /// We may want to replace this eventually with a proper interner, so that we can also gain the
    /// benefits commonly provided by interned string handles (e.g. cheap equality comparisons, no
    /// ref-counting overhead, copyable and of smaller size).
    ///
    /// Note that `Ident`, `ProcedureName`, `LibraryPath` and others are all implemented in terms
    /// of either the actual reference-counted string, e.g. `Arc<str>`, or in terms of `Ident`,
    /// which is essentially the former wrapped in a `SourceSpan`. If we ever replace this with
    /// a better interner, we will also want to update those types to be in terms of whatever
    /// the handle type of the interner is.
    interned: BTreeSet<Arc<str>>,
    /// When true, all warning diagnostics are promoted to error severity.
    warnings_as_errors: bool,
}
76
77impl ModuleParser {
78    /// Construct a new parser for the given `kind` of [ast::Module].
79    pub fn new(kind: ast::ModuleKind) -> Self {
80        Self {
81            kind,
82            interned: Default::default(),
83            warnings_as_errors: false,
84        }
85    }
86
87    /// Configure this parser so that any warning diagnostics are promoted to errors.
88    pub fn set_warnings_as_errors(&mut self, yes: bool) {
89        self.warnings_as_errors = yes;
90    }
91
92    /// Parse a [ast::Module] from `source`, and give it the provided `path`.
93    pub fn parse(
94        &mut self,
95        path: impl AsRef<Path>,
96        source: Arc<SourceFile>,
97        source_manager: Arc<dyn SourceManager>,
98    ) -> Result<Box<ast::Module>, Report> {
99        let path = path.as_ref();
100        if let Err(err) = Path::validate(path.as_str()) {
101            return Err(Report::msg(err.to_string()).with_source_code(source));
102        }
103        let forms = parse_forms_internal(source.clone(), &mut self.interned)
104            .map_err(|err| Report::new(err).with_source_code(source.clone()))?;
105        sema::analyze(source, self.kind, path, forms, self.warnings_as_errors, source_manager)
106            .map_err(Report::new)
107    }
108
109    /// Parse a [ast::Module], `name`, from `path`.
110    #[cfg(feature = "std")]
111    pub fn parse_file<N, P>(
112        &mut self,
113        name: N,
114        path: P,
115        source_manager: Arc<dyn SourceManager>,
116    ) -> Result<Box<ast::Module>, Report>
117    where
118        N: AsRef<Path>,
119        P: AsRef<std::path::Path>,
120    {
121        use miden_debug_types::SourceManagerExt;
122        use miden_utils_diagnostics::{IntoDiagnostic, WrapErr};
123
124        let path = path.as_ref();
125        let source_file = source_manager
126            .load_file(path)
127            .into_diagnostic()
128            .wrap_err_with(|| format!("failed to load source file from '{}'", path.display()))?;
129        self.parse(name, source_file, source_manager)
130    }
131
132    /// Parse a [ast::Module], `name`, from `source`.
133    pub fn parse_str(
134        &mut self,
135        name: impl AsRef<Path>,
136        source: impl ToString,
137        source_manager: Arc<dyn SourceManager>,
138    ) -> Result<Box<ast::Module>, Report> {
139        use miden_debug_types::SourceContent;
140
141        let name = name.as_ref();
142        let uri = Uri::from(name.as_str().to_string().into_boxed_str());
143        let content = SourceContent::new(
144            SourceLanguage::Masm,
145            uri.clone(),
146            source.to_string().into_boxed_str(),
147        );
148        let source_file = source_manager.load_from_raw_parts(uri, content);
149        self.parse(name, source_file, source_manager)
150    }
151}
152
/// Test helper which parses `source` into its raw [ast::Form]s, instead of a full
/// [ast::Module].
///
/// Each call uses a fresh, throwaway set of interned strings.
///
/// NOTE: Semantic analysis is _not_ run on the result.
#[cfg(any(test, feature = "testing"))]
pub fn parse_forms(source: Arc<SourceFile>) -> Result<Vec<ast::Form>, ParsingError> {
    parse_forms_internal(source, &mut BTreeSet::new())
}
162
163/// Parse `source` as a set of [ast::Form]s
164///
165/// Aside from catching syntax errors, this does little validation of the resulting forms, that is
166/// handled by semantic analysis, which the caller is expected to perform next.
167fn parse_forms_internal(
168    source: Arc<SourceFile>,
169    interned: &mut BTreeSet<Arc<str>>,
170) -> Result<Vec<ast::Form>, ParsingError> {
171    let source_id = source.id();
172    let scanner = Scanner::new(source.as_str());
173    let lexer = Lexer::new(source_id, scanner);
174    let felt_type = Arc::new(ast::types::ArrayType::new(ast::types::Type::Felt, 4));
175    grammar::FormsParser::new()
176        .parse(source_id, interned, &felt_type, core::marker::PhantomData, lexer)
177        .map_err(|err| ParsingError::from_parse_error(source_id, err))
178}
179
180// DIRECTORY PARSER
181// ================================================================================================
182
/// Parse every module found under `dir`, placing each one beneath `namespace`.
///
/// The directory is traversed recursively, and each module file is parsed as a library module,
/// with `warnings_as_errors` applied to every parse.
///
/// # Errors
///
/// An error is returned when:
///
/// * `dir` is not a directory
/// * the root module file (`ast::Module::ROOT_FILENAME`) exists directly inside `dir`
/// * the file system could not be traversed
/// * a module has an invalid name, fails to parse, or is defined more than once
///
/// On success, returns an iterator over all parsed modules, ordered by module path.
#[cfg(feature = "std")]
pub fn read_modules_from_dir(
    dir: impl AsRef<std::path::Path>,
    namespace: impl AsRef<Path>,
    source_manager: Arc<dyn SourceManager>,
    warnings_as_errors: bool,
) -> Result<impl Iterator<Item = Box<ast::Module>>, Report> {
    use std::collections::{BTreeMap, btree_map::Entry};

    use miden_utils_diagnostics::{IntoDiagnostic, WrapErr, report};
    use module_walker::{ModuleEntry, WalkModules};

    let root = dir.as_ref();
    if !root.is_dir() {
        return Err(report!("the provided path '{}' is not a valid directory", root.display()));
    }

    // The root module file may only appear in subdirectories, never in the root itself
    let root_module_file = root.join(ast::Module::ROOT_FILENAME);
    if root_module_file.exists() {
        return Err(report!("{} is not allowed in the root directory", ast::Module::ROOT_FILENAME));
    }

    let walker = WalkModules::new(namespace.as_ref().to_path_buf(), root)
        .into_diagnostic()
        .wrap_err_with(|| format!("failed to load modules from '{}'", root.display()))?;

    let mut parsed = BTreeMap::new();
    for entry in walker {
        let ModuleEntry { name: mut module_name, source_path } = entry?;

        // A root module takes the name of the directory containing it, so drop the trailing
        // component in that case.
        if module_name.last().unwrap() == ast::Module::ROOT {
            module_name.pop();
        }

        // Each module gets a parser of its own
        let mut parser = ModuleParser::new(ast::ModuleKind::Library);
        parser.set_warnings_as_errors(warnings_as_errors);
        let module = parser.parse_file(&module_name, &source_path, source_manager.clone())?;

        match parsed.entry(module_name) {
            Entry::Vacant(slot) => {
                slot.insert(module);
            },
            Entry::Occupied(existing) => {
                return Err(report!("duplicate module '{0}'", existing.key()));
            },
        }
    }

    Ok(parsed.into_values())
}
239
#[cfg(feature = "std")]
mod module_walker {
    use std::{
        ffi::OsStr,
        fs::{self, DirEntry, FileType},
        io,
        path::{Path, PathBuf},
    };

    use miden_utils_diagnostics::{IntoDiagnostic, Report, report};

    use crate::{Path as LibraryPath, PathBuf as LibraryPathBuf, ast::Module};

    /// A module file discovered by [WalkModules].
    pub struct ModuleEntry {
        /// The namespaced path derived from the file's location relative to the walk root
        pub name: LibraryPathBuf,
        /// The location of the module file on disk
        pub source_path: PathBuf,
    }

    /// An iterator over the module files found beneath a root directory.
    ///
    /// Directories are expanded as they are reached, with their contents appended to the back
    /// of the queue of entries still to be visited.
    pub struct WalkModules<'a> {
        /// The path under which every discovered module is placed
        namespace: LibraryPathBuf,
        /// The directory the walk started from
        root: &'a Path,
        /// Directory entries which have not been visited yet, in visiting order
        pending: alloc::collections::VecDeque<io::Result<DirEntry>>,
    }

    impl<'a> WalkModules<'a> {
        /// Start a walk of `path`, placing discovered modules under `namespace`.
        ///
        /// Fails if `path` itself cannot be read as a directory.
        pub fn new(namespace: LibraryPathBuf, path: &'a Path) -> io::Result<Self> {
            let pending = fs::read_dir(path)?.collect();

            Ok(Self { namespace, root: path, pending })
        }

        /// Visit a single directory entry of type `ty`.
        ///
        /// Returns `Ok(None)` when the entry yields no module itself, i.e. it is a directory
        /// (whose contents are queued for later), or a file without the module extension.
        fn next_entry(
            &mut self,
            entry: &DirEntry,
            ty: FileType,
        ) -> Result<Option<ModuleEntry>, Report> {
            let source_path = entry.path();

            if ty.is_dir() {
                let children = fs::read_dir(&source_path).into_diagnostic()?;
                self.pending.extend(children);
                return Ok(None);
            }

            let has_module_extension = source_path
                .extension()
                .is_some_and(|ext| ext == AsRef::<OsStr>::as_ref(Module::FILE_EXTENSION));
            if !has_module_extension {
                return Ok(None);
            }

            // Without its extension, the file path must not name an existing directory
            let stem_path = source_path.with_extension("");
            if stem_path.is_dir() {
                return Err(report!(
                    "file and directory with same name are not allowed: {}",
                    stem_path.display()
                ));
            }

            // What remains after removing the root prefix is the namespace-relative path
            let relative_path = stem_path
                .strip_prefix(self.root)
                .expect("expected path to be a child of the root directory");

            // Validate each component, and append it to the namespace
            let mut libpath = self.namespace.clone();
            for component in relative_path.iter() {
                let Some(component) = component.to_str() else {
                    return Err(report!(
                        "{} is an invalid directory entry",
                        source_path.display()
                    ));
                };
                LibraryPath::validate(component).into_diagnostic()?;
                libpath.push(component);
            }

            Ok(Some(ModuleEntry { name: libpath, source_path }))
        }
    }

    impl Iterator for WalkModules<'_> {
        type Item = Result<ModuleEntry, Report>;

        fn next(&mut self) -> Option<Self::Item> {
            // Keep visiting entries until one of them produces a module or an error
            while let Some(queued) = self.pending.pop_front() {
                let resolved = queued
                    .and_then(|entry| entry.file_type().map(|ty| (entry, ty)))
                    .into_diagnostic();
                let (entry, file_type) = match resolved {
                    Ok(pair) => pair,
                    Err(err) => return Some(Err(err)),
                };

                match self.next_entry(&entry, file_type) {
                    Ok(None) => continue,
                    Ok(Some(module)) => return Some(Ok(module)),
                    Err(err) => return Some(Err(err)),
                }
            }

            None
        }
    }
}
343
344// TESTS
345// ================================================================================================
346
#[cfg(test)]
mod tests {
    use miden_core::assert_matches;
    use miden_debug_types::SourceId;

    use super::*;

    // Exercises how the lexer tokenizes `exp(.u?[\d]+)?`
    #[test]
    fn lex_exp() {
        let mut tokens = Lexer::new(SourceId::default(), Scanner::new("begin exp.u9 end"))
            .map(|item| item.map(|(_, token, _)| token));

        assert_matches!(tokens.next(), Some(Ok(Token::Begin)));
        assert_matches!(tokens.next(), Some(Ok(Token::ExpU)));
        assert_matches!(tokens.next(), Some(Ok(Token::Int(n))) if n == 9);
        assert_matches!(tokens.next(), Some(Ok(Token::End)));
    }

    // A constant declaration followed by a block using `.err=` with both a constant and a
    // literal error code
    #[test]
    fn lex_block() {
        let source = "\
const ERR1 = 1

begin
    u32assertw
    u32assertw.err=ERR1
    u32assertw.err=2
end
";
        let mut tokens = Lexer::new(SourceId::default(), Scanner::new(source))
            .map(|item| item.map(|(_, token, _)| token));

        // const ERR1 = 1
        assert_matches!(tokens.next(), Some(Ok(Token::Const)));
        assert_matches!(tokens.next(), Some(Ok(Token::ConstantIdent("ERR1"))));
        assert_matches!(tokens.next(), Some(Ok(Token::Equal)));
        assert_matches!(tokens.next(), Some(Ok(Token::Int(1))));
        // begin
        assert_matches!(tokens.next(), Some(Ok(Token::Begin)));
        // u32assertw
        assert_matches!(tokens.next(), Some(Ok(Token::U32Assertw)));
        // u32assertw.err=ERR1
        assert_matches!(tokens.next(), Some(Ok(Token::U32Assertw)));
        assert_matches!(tokens.next(), Some(Ok(Token::Dot)));
        assert_matches!(tokens.next(), Some(Ok(Token::Err)));
        assert_matches!(tokens.next(), Some(Ok(Token::Equal)));
        assert_matches!(tokens.next(), Some(Ok(Token::ConstantIdent("ERR1"))));
        // u32assertw.err=2
        assert_matches!(tokens.next(), Some(Ok(Token::U32Assertw)));
        assert_matches!(tokens.next(), Some(Ok(Token::Dot)));
        assert_matches!(tokens.next(), Some(Ok(Token::Err)));
        assert_matches!(tokens.next(), Some(Ok(Token::Equal)));
        assert_matches!(tokens.next(), Some(Ok(Token::Int(2))));
        // end
        assert_matches!(tokens.next(), Some(Ok(Token::End)));
        assert_matches!(tokens.next(), Some(Ok(Token::Eof)));
    }

    // `emit.event("...")` is lexed as separate tokens, with the event name as a quoted identifier
    #[test]
    fn lex_emit() {
        let source = "\
begin
    push.1
    emit.event(\"abc\")
end
";
        let mut tokens = Lexer::new(SourceId::default(), Scanner::new(source))
            .map(|item| item.map(|(_, token, _)| token));

        // begin
        assert_matches!(tokens.next(), Some(Ok(Token::Begin)));
        // push.1
        assert_matches!(tokens.next(), Some(Ok(Token::Push)));
        assert_matches!(tokens.next(), Some(Ok(Token::Dot)));
        assert_matches!(tokens.next(), Some(Ok(Token::Int(1))));
        // emit.event("abc")
        assert_matches!(tokens.next(), Some(Ok(Token::Emit)));
        assert_matches!(tokens.next(), Some(Ok(Token::Dot)));
        assert_matches!(tokens.next(), Some(Ok(Token::Event)));
        assert_matches!(tokens.next(), Some(Ok(Token::Lparen)));
        assert_matches!(tokens.next(), Some(Ok(Token::QuotedIdent("abc"))));
        assert_matches!(tokens.next(), Some(Ok(Token::Rparen)));
        // end
        assert_matches!(tokens.next(), Some(Ok(Token::End)));
        assert_matches!(tokens.next(), Some(Ok(Token::Eof)));
    }

    // An invalid character following whitespace is reported with a span covering just that
    // character
    #[test]
    fn lex_invalid_token_after_whitespace_returns_error() {
        let source = "begin \u{0001}\nend\n";
        let mut tokens = Lexer::new(SourceId::default(), Scanner::new(source))
            .map(|item| item.map(|(_, token, _)| token));

        assert_matches!(tokens.next(), Some(Ok(Token::Begin)));
        assert_matches!(
            tokens.next(),
            Some(Err(ParsingError::InvalidToken { span })) if span.into_range() == (6..7)
        );
    }

    // The span reported for the invalid `_-` sequence covers the underscore only
    #[test]
    fn lex_invalid_underscore_token_span() {
        let source = "begin _-\nend\n";
        let mut tokens = Lexer::new(SourceId::default(), Scanner::new(source))
            .map(|item| item.map(|(_, token, _)| token));

        assert_matches!(tokens.next(), Some(Ok(Token::Begin)));
        assert_matches!(
            tokens.next(),
            Some(Err(ParsingError::InvalidToken { span })) if span.into_range() == (6..7)
        );
    }

    // Here the lexer output is checked unmapped, so that the start and end offsets of each
    // token are part of the assertion
    #[test]
    fn lex_single_char_token_and_ident_spans() {
        let mut spanned = Lexer::new(SourceId::default(), Scanner::new("@\nA\n"));

        assert_matches!(spanned.next(), Some(Ok((0, Token::At, 1))));
        assert_matches!(spanned.next(), Some(Ok((2, Token::ConstantIdent("A"), 3))));
    }

    // A path component of `u16::MAX` characters must produce a diagnostic, rather than a panic
    #[test]
    fn overlong_path_component_is_rejected_without_panic() {
        use std::{
            panic::{AssertUnwindSafe, catch_unwind},
            sync::Arc,
        };

        use crate::{
            debuginfo::DefaultSourceManager,
            parse::{Parse, ParseOptions},
        };

        let oversized = "a".repeat(u16::MAX as usize);
        let source = format!("begin\n    exec.{oversized}::x::foo\nend\n");
        let source_manager = Arc::new(DefaultSourceManager::default());

        // Any panic raised while parsing is caught here, so that it can be reported as a test
        // failure with a meaningful message.
        let outcome = catch_unwind(AssertUnwindSafe(|| {
            source.parse_with_options(source_manager, ParseOptions::default())
        }));

        assert!(outcome.is_ok(), "parsing panicked, expected a structured error");
        let err = outcome.unwrap().expect_err("parsing succeeded, expected an error");
        crate::assert_diagnostic!(err, "this reference is invalid without a corresponding import");
    }
}