//! Parser infrastructure for Miden Assembly syntax.
//!
//! (`miden_assembly_syntax/parser/mod.rs`)
/// Convenience macro used by the grammar definition to construct source spans.
///
/// Two forms are supported:
///
/// * `span!($id, $i)` — a span anchored at the single offset `$i`
/// * `span!($id, $l, $r)` — a span covering the half-open range `$l..$r`
macro_rules! span {
    ($id:expr, $i:expr) => {
        ::miden_debug_types::SourceSpan::at($id, $i)
    };
    ($id:expr, $l:expr, $r:expr) => {
        ::miden_debug_types::SourceSpan::new($id, $l..$r)
    };
}
10
// Pull in the LALRPOP-generated parser for the MASM grammar. The generated code does not
// conform to our lint configuration, so clippy lints are suppressed for it wholesale.
lalrpop_util::lalrpop_mod!(
    #[expect(clippy::all)]
    grammar,
    "/parser/grammar.rs"
);

mod error;
mod lexer;
mod scanner;
mod token;
21
22use alloc::{boxed::Box, collections::BTreeSet, string::ToString, sync::Arc, vec::Vec};
23
24use miden_debug_types::{SourceFile, SourceLanguage, SourceManager, Uri};
25use miden_utils_diagnostics::Report;
26
27pub use self::{
28    error::{BinErrorKind, HexErrorKind, LiteralErrorKind, ParsingError},
29    lexer::Lexer,
30    scanner::Scanner,
31    token::{BinEncodedValue, DocumentationType, IntValue, PushValue, Token, WordValue},
32};
33use crate::{Path, ast, sema};
34
// TYPE ALIASES
// ================================================================================================

/// The raw error type produced by the LALRPOP-generated parser.
///
/// The type parameters are, in order: the location type (`u32` byte offsets, as emitted by
/// [Lexer]), the token type, and the user-defined error type we convert into.
type ParseError<'a> = lalrpop_util::ParseError<u32, Token<'a>, ParsingError>;
39
40// MODULE PARSER
41// ================================================================================================
42
/// This is a wrapper around the lower-level parser infrastructure which handles orchestrating all
/// of the pieces needed to parse a [ast::Module] from source, and run semantic analysis on it.
#[derive(Default)]
pub struct ModuleParser {
    /// The kind of module we're parsing.
    ///
    /// This is used when performing semantic analysis to detect when various invalid constructions
    /// are encountered, such as use of the `syscall` instruction in a kernel module.
    kind: ast::ModuleKind,
    /// A set of interned strings allocated during parsing/semantic analysis.
    ///
    /// This is a very primitive and imprecise way of interning strings, but was the least invasive
    /// at the time the new parser was implemented. In essence, we avoid duplicating allocations
    /// for frequently occurring strings, by tracking which strings we've seen before, and
    /// sharing a reference counted pointer instead.
    ///
    /// We may want to replace this eventually with a proper interner, so that we can also gain the
    /// benefits commonly provided by interned string handles (e.g. cheap equality comparisons, no
    /// ref-counting overhead, copyable and of smaller size).
    ///
    /// Note that [Ident], [ProcedureName], [LibraryPath] and others are all implemented in terms
    /// of either the actual reference-counted string, e.g. `Arc<str>`, or in terms of [Ident],
    /// which is essentially the former wrapped in a [SourceSpan]. If we ever replace this with
    /// a better interner, we will also want to update those types to be in terms of whatever
    /// the handle type of the interner is.
    interned: BTreeSet<Arc<str>>,
    /// When true, all warning diagnostics are promoted to error severity
    warnings_as_errors: bool,
}
72
73impl ModuleParser {
74    /// Construct a new parser for the given `kind` of [ast::Module].
75    pub fn new(kind: ast::ModuleKind) -> Self {
76        Self {
77            kind,
78            interned: Default::default(),
79            warnings_as_errors: false,
80        }
81    }
82
83    /// Configure this parser so that any warning diagnostics are promoted to errors.
84    pub fn set_warnings_as_errors(&mut self, yes: bool) {
85        self.warnings_as_errors = yes;
86    }
87
88    /// Parse a [ast::Module] from `source`, and give it the provided `path`.
89    pub fn parse(
90        &mut self,
91        path: impl AsRef<Path>,
92        source: Arc<SourceFile>,
93        source_manager: Arc<dyn SourceManager>,
94    ) -> Result<Box<ast::Module>, Report> {
95        let path = path.as_ref();
96        if let Err(err) = Path::validate(path.as_str()) {
97            return Err(Report::msg(err.to_string()).with_source_code(source.clone()));
98        }
99        let forms = parse_forms_internal(source.clone(), &mut self.interned)
100            .map_err(|err| Report::new(err).with_source_code(source.clone()))?;
101        sema::analyze(source, self.kind, path, forms, self.warnings_as_errors, source_manager)
102            .map_err(Report::new)
103    }
104
105    /// Parse a [ast::Module], `name`, from `path`.
106    #[cfg(feature = "std")]
107    pub fn parse_file<N, P>(
108        &mut self,
109        name: N,
110        path: P,
111        source_manager: Arc<dyn SourceManager>,
112    ) -> Result<Box<ast::Module>, Report>
113    where
114        N: AsRef<Path>,
115        P: AsRef<std::path::Path>,
116    {
117        use miden_debug_types::SourceManagerExt;
118        use miden_utils_diagnostics::{IntoDiagnostic, WrapErr};
119
120        let path = path.as_ref();
121        let source_file = source_manager
122            .load_file(path)
123            .into_diagnostic()
124            .wrap_err_with(|| format!("failed to load source file from '{}'", path.display()))?;
125        self.parse(name, source_file, source_manager)
126    }
127
128    /// Parse a [ast::Module], `name`, from `source`.
129    pub fn parse_str(
130        &mut self,
131        name: impl AsRef<Path>,
132        source: impl ToString,
133        source_manager: Arc<dyn SourceManager>,
134    ) -> Result<Box<ast::Module>, Report> {
135        use miden_debug_types::SourceContent;
136
137        let name = name.as_ref();
138        let uri = Uri::from(name.as_str().to_string().into_boxed_str());
139        let content = SourceContent::new(
140            SourceLanguage::Masm,
141            uri.clone(),
142            source.to_string().into_boxed_str(),
143        );
144        let source_file = source_manager.load_from_raw_parts(uri, content);
145        self.parse(name, source_file, source_manager)
146    }
147}
148
/// This is used in tests to parse `source` as a set of raw [ast::Form]s rather than as a
/// [ast::Module].
///
/// NOTE: This does _not_ run semantic analysis.
#[cfg(any(test, feature = "testing"))]
pub fn parse_forms(source: Arc<SourceFile>) -> Result<Vec<ast::Form>, ParsingError> {
    // A throwaway intern set is sufficient here, since no module is being constructed
    parse_forms_internal(source, &mut BTreeSet::default())
}
158
159/// Parse `source` as a set of [ast::Form]s
160///
161/// Aside from catching syntax errors, this does little validation of the resulting forms, that is
162/// handled by semantic analysis, which the caller is expected to perform next.
163fn parse_forms_internal(
164    source: Arc<SourceFile>,
165    interned: &mut BTreeSet<Arc<str>>,
166) -> Result<Vec<ast::Form>, ParsingError> {
167    let source_id = source.id();
168    let scanner = Scanner::new(source.as_str());
169    let lexer = Lexer::new(source_id, scanner);
170    let felt_type = Arc::new(ast::types::ArrayType::new(ast::types::Type::Felt, 4));
171    grammar::FormsParser::new()
172        .parse(source_id, interned, &felt_type, core::marker::PhantomData, lexer)
173        .map_err(|err| ParsingError::from_parse_error(source_id, err))
174}
175
176// DIRECTORY PARSER
177// ================================================================================================
178
/// Read the contents (modules) of this library from `dir`, returning any errors that occur
/// while traversing the file system.
///
/// Errors may also be returned if traversal discovers issues with the modules, such as
/// invalid names, etc.
///
/// Returns an iterator over all parsed modules.
#[cfg(feature = "std")]
pub fn read_modules_from_dir(
    dir: impl AsRef<std::path::Path>,
    namespace: impl AsRef<Path>,
    source_manager: Arc<dyn SourceManager>,
    warnings_as_errors: bool,
) -> Result<impl Iterator<Item = Box<ast::Module>>, Report> {
    use std::collections::{BTreeMap, btree_map::Entry};

    use miden_utils_diagnostics::{IntoDiagnostic, WrapErr, report};
    use module_walker::{ModuleEntry, WalkModules};

    let dir = dir.as_ref();
    if !dir.is_dir() {
        return Err(report!("the provided path '{}' is not a valid directory", dir.display()));
    }

    // mod.masm is not allowed in the root directory
    if dir.join(ast::Module::ROOT_FILENAME).exists() {
        return Err(report!("{} is not allowed in the root directory", ast::Module::ROOT_FILENAME));
    }

    let mut modules = BTreeMap::default();

    let walker = WalkModules::new(namespace.as_ref().to_path_buf(), dir)
        .into_diagnostic()
        .wrap_err_with(|| format!("failed to load modules from '{}'", dir.display()))?;
    for entry in walker {
        let ModuleEntry { mut name, source_path } = entry?;
        // A module named after the root component stands for its enclosing directory,
        // so drop the trailing component from the module name.
        if name.last().unwrap() == ast::Module::ROOT {
            name.pop();
        }

        // Parse module at the given path
        let mut parser = ModuleParser::new(ast::ModuleKind::Library);
        parser.set_warnings_as_errors(warnings_as_errors);
        let ast = parser.parse_file(&name, &source_path, source_manager.clone())?;
        match modules.entry(name) {
            Entry::Occupied(entry) => {
                // Format through the reference; cloning the key just to display it in the
                // error message was a needless allocation.
                return Err(report!("duplicate module '{0}'", entry.key()));
            },
            Entry::Vacant(entry) => {
                entry.insert(ast);
            },
        }
    }

    Ok(modules.into_values())
}
235
#[cfg(feature = "std")]
mod module_walker {
    use std::{
        ffi::OsStr,
        fs::{self, DirEntry, FileType},
        io,
        path::{Path, PathBuf},
    };

    use miden_utils_diagnostics::{IntoDiagnostic, Report, report};

    use crate::{Path as LibraryPath, PathBuf as LibraryPathBuf, ast::Module};

    /// A module discovered during traversal: its library path and its location on disk.
    pub struct ModuleEntry {
        pub name: LibraryPathBuf,
        pub source_path: PathBuf,
    }

    /// An iterator over all MASM module files under a root directory, yielding one
    /// [ModuleEntry] per module file.
    pub struct WalkModules<'a> {
        /// Namespace prefixed to every discovered module's library path.
        namespace: LibraryPathBuf,
        /// Root of the traversal; discovered file paths are made relative to this.
        root: &'a Path,
        /// Queue of directory entries still to be visited.
        stack: alloc::collections::VecDeque<io::Result<DirEntry>>,
    }

    impl<'a> WalkModules<'a> {
        pub fn new(namespace: LibraryPathBuf, path: &'a Path) -> io::Result<Self> {
            use alloc::collections::VecDeque;

            let stack = VecDeque::from_iter(fs::read_dir(path)?);

            Ok(Self { namespace, root: path, stack })
        }

        /// Examine one directory entry.
        ///
        /// Directories are queued for later traversal and yield `Ok(None)`, as do files
        /// without the MASM extension; module files are turned into a [ModuleEntry].
        fn next_entry(
            &mut self,
            entry: &DirEntry,
            ty: &FileType,
        ) -> Result<Option<ModuleEntry>, Report> {
            // Descend into subdirectories by queueing their contents
            if ty.is_dir() {
                self.stack.extend(fs::read_dir(entry.path()).into_diagnostic()?);
                return Ok(None);
            }

            let mut file_path = entry.path();
            let has_module_ext = matches!(
                file_path.extension(),
                Some(ext) if ext == AsRef::<OsStr>::as_ref(Module::FILE_EXTENSION)
            );
            if !has_module_ext {
                return Ok(None);
            }

            // Remove the file extension and the root prefix, leaving a namespace-relative path
            file_path.set_extension("");
            if file_path.is_dir() {
                return Err(report!(
                    "file and directory with same name are not allowed: {}",
                    file_path.display()
                ));
            }
            let relative_path = file_path
                .strip_prefix(self.root)
                .expect("expected path to be a child of the root directory");

            // Construct a [LibraryPath] from the path components, after validating them
            let mut libpath = self.namespace.clone();
            for component in relative_path.iter() {
                // Components must be valid UTF-8 to form a library path
                let component = component.to_str().ok_or_else(|| {
                    report!("{} is an invalid directory entry", entry.path().display())
                })?;
                LibraryPath::validate(component).into_diagnostic()?;
                libpath.push(component);
            }
            Ok(Some(ModuleEntry { name: libpath, source_path: entry.path() }))
        }
    }

    impl Iterator for WalkModules<'_> {
        type Item = Result<ModuleEntry, Report>;

        fn next(&mut self) -> Option<Self::Item> {
            loop {
                // Take the next queued entry and look up its file type; either step can fail
                let next = self
                    .stack
                    .pop_front()?
                    .and_then(|entry| entry.file_type().map(|ft| (entry, ft)))
                    .into_diagnostic();

                match next {
                    Ok((entry, file_type)) => match self.next_entry(&entry, &file_type) {
                        // Not a module file (or a directory that was queued): keep walking
                        Ok(None) => continue,
                        Ok(Some(module)) => break Some(Ok(module)),
                        Err(err) => break Some(Err(err)),
                    },
                    Err(err) => break Some(Err(err)),
                }
            }
        }
    }
}
339
340// TESTS
341// ================================================================================================
342
#[cfg(test)]
mod tests {
    use miden_core::assert_matches;
    use miden_debug_types::SourceId;

    use super::*;

    // This test checks the lexer behavior with regard to tokenizing `exp(.u?[\d]+)?`
    #[test]
    fn lex_exp() {
        let source_id = SourceId::default();
        let scanner = Scanner::new("begin exp.u9 end");
        // Keep only the token, dropping each lexeme's start/end offsets
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));
        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        // `exp.u9` lexes as an `exp.u` token followed by the integer `9`
        assert_matches!(lexer.next(), Some(Ok(Token::ExpU)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(n))) if n == 9);
        assert_matches!(lexer.next(), Some(Ok(Token::End)));
    }

    // Verifies tokenization of a small program with constants and dotted instruction suffixes
    #[test]
    fn lex_block() {
        let source_id = SourceId::default();
        let scanner = Scanner::new(
            "\
const ERR1 = 1

begin
    u32assertw
    u32assertw.err=ERR1
    u32assertw.err=2
end
",
        );
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));
        assert_matches!(lexer.next(), Some(Ok(Token::Const)));
        assert_matches!(lexer.next(), Some(Ok(Token::ConstantIdent("ERR1"))));
        assert_matches!(lexer.next(), Some(Ok(Token::Equal)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(1))));
        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        assert_matches!(lexer.next(), Some(Ok(Token::U32Assertw)));
        // `.err=ERR1` produces separate Dot / Err / Equal / ident tokens
        assert_matches!(lexer.next(), Some(Ok(Token::U32Assertw)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Err)));
        assert_matches!(lexer.next(), Some(Ok(Token::Equal)));
        assert_matches!(lexer.next(), Some(Ok(Token::ConstantIdent("ERR1"))));
        assert_matches!(lexer.next(), Some(Ok(Token::U32Assertw)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Err)));
        assert_matches!(lexer.next(), Some(Ok(Token::Equal)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(2))));
        assert_matches!(lexer.next(), Some(Ok(Token::End)));
        assert_matches!(lexer.next(), Some(Ok(Token::Eof)));
    }

    // Verifies tokenization of `emit.event("...")`, including the quoted identifier
    #[test]
    fn lex_emit() {
        let source_id = SourceId::default();
        let scanner = Scanner::new(
            "\
begin
    push.1
    emit.event(\"abc\")
end
",
        );
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));
        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        assert_matches!(lexer.next(), Some(Ok(Token::Push)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(1))));
        assert_matches!(lexer.next(), Some(Ok(Token::Emit)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Event)));
        assert_matches!(lexer.next(), Some(Ok(Token::Lparen)));
        assert_matches!(lexer.next(), Some(Ok(Token::QuotedIdent("abc"))));
        assert_matches!(lexer.next(), Some(Ok(Token::Rparen)));
        assert_matches!(lexer.next(), Some(Ok(Token::End)));
        assert_matches!(lexer.next(), Some(Ok(Token::Eof)));
    }

    // An unlexable byte (U+0001) after whitespace must produce an InvalidToken error whose
    // span covers exactly that byte (offsets 6..7 here)
    #[test]
    fn lex_invalid_token_after_whitespace_returns_error() {
        let source_id = SourceId::default();
        let scanner = Scanner::new("begin \u{0001}\nend\n");
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));

        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        assert_matches!(
            lexer.next(),
            Some(Err(ParsingError::InvalidToken { span })) if span.into_range() == (6..7)
        );
    }

    // A bare underscore is not a valid token start; the error span must point at it only
    #[test]
    fn lex_invalid_underscore_token_span() {
        let source_id = SourceId::default();
        let scanner = Scanner::new("begin _-\nend\n");
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));

        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        assert_matches!(
            lexer.next(),
            Some(Err(ParsingError::InvalidToken { span })) if span.into_range() == (6..7)
        );
    }

    // Checks the raw (start, token, end) triples: spans are byte offsets into the source
    #[test]
    fn lex_single_char_token_and_ident_spans() {
        let source_id = SourceId::default();
        let scanner = Scanner::new("@\nA\n");
        let mut lexer = Lexer::new(source_id, scanner);

        assert_matches!(lexer.next(), Some(Ok((0, Token::At, 1))));
        assert_matches!(lexer.next(), Some(Ok((2, Token::ConstantIdent("A"), 3))));
    }

    // Regression test: a 256-character path component must yield a structured parse error,
    // not a panic, when parsing a module that references it
    #[test]
    fn overlong_path_component_is_rejected_without_panic() {
        use std::{
            panic::{AssertUnwindSafe, catch_unwind},
            sync::Arc,
        };

        use crate::{
            debuginfo::DefaultSourceManager,
            parse::{Parse, ParseOptions},
        };

        let big_component = "a".repeat(256);
        let source = format!("begin\n    exec.{big_component}::x::foo\nend\n");

        let source_manager = Arc::new(DefaultSourceManager::default());
        // catch_unwind distinguishes "panicked" (test failure) from "returned an error" (expected)
        let parsed = catch_unwind(AssertUnwindSafe(|| {
            source.parse_with_options(source_manager, ParseOptions::default())
        }));

        assert!(parsed.is_ok(), "parsing panicked, expected a structured error");
        let err = parsed.unwrap().expect_err("parsing succeeded, expected an error");
        crate::assert_diagnostic!(err, "this reference is invalid without a corresponding import");
    }
}
483}