//! Parser infrastructure for Miden Assembly syntax.
//!
//! Source: `miden_assembly_syntax/parser/mod.rs`

/// Simple macro used in the grammar definition for constructing spans
///
/// Two forms are supported:
/// - `span!($id, $l, $r)` builds a span via [SourceSpan::new] over the range `$l..$r`
/// - `span!($id, $i)` builds a span via [SourceSpan::at] anchored at the single offset `$i`
macro_rules! span {
    ($id:expr, $l:expr, $r:expr) => {
        ::miden_debug_types::SourceSpan::new($id, $l..$r)
    };
    ($id:expr, $i:expr) => {
        ::miden_debug_types::SourceSpan::at($id, $i)
    };
}
10
// Pull in the LALRPOP-generated parser as the `grammar` submodule. The generated code is
// not under our control, so clippy and the unused-lifetimes lint are suppressed for it.
lalrpop_util::lalrpop_mod!(
    #[expect(clippy::all)]
    #[expect(unused_lifetimes)]
    grammar,
    "/parser/grammar.rs"
);
17
18mod error;
19mod lexer;
20mod scanner;
21mod token;
22
23use alloc::{boxed::Box, collections::BTreeSet, string::ToString, sync::Arc, vec::Vec};
24
25use miden_debug_types::{SourceFile, SourceLanguage, SourceManager, Uri};
26use miden_utils_diagnostics::Report;
27
28pub use self::{
29    error::{BinErrorKind, HexErrorKind, LiteralErrorKind, ParsingError},
30    lexer::Lexer,
31    scanner::Scanner,
32    token::{BinEncodedValue, DocumentationType, IntValue, PushValue, Token, WordValue},
33};
34use crate::{Path, ast, sema};
35
// TYPE ALIASES
// ================================================================================================

/// The concrete lalrpop parse error type used by this parser: locations are `u32` offsets
/// (as produced by [Lexer]), and [ParsingError] is the user-defined error payload.
type ParseError<'a> = lalrpop_util::ParseError<u32, Token<'a>, ParsingError>;
40
41// MODULE PARSER
42// ================================================================================================
43
/// This is a wrapper around the lower-level parser infrastructure which handles orchestrating all
/// of the pieces needed to parse a [ast::Module] from source, and run semantic analysis on it.
///
/// The derived [Default] instance uses the default [ast::ModuleKind], an empty intern set, and
/// does not promote warnings to errors; use [Self::new] and [Self::set_warnings_as_errors] to
/// configure those explicitly.
#[derive(Default)]
pub struct ModuleParser {
    /// The kind of module we're parsing.
    ///
    /// This is used when performing semantic analysis to detect when various invalid constructions
    /// are encountered, such as use of the `syscall` instruction in a kernel module.
    kind: ast::ModuleKind,
    /// A set of interned strings allocated during parsing/semantic analysis.
    ///
    /// This is a very primitive and imprecise way of interning strings, but was the least invasive
    /// at the time the new parser was implemented. In essence, we avoid duplicating allocations
    /// for frequently occurring strings, by tracking which strings we've seen before, and
    /// sharing a reference counted pointer instead.
    ///
    /// We may want to replace this eventually with a proper interner, so that we can also gain the
    /// benefits commonly provided by interned string handles (e.g. cheap equality comparisons, no
    /// ref- counting overhead, copyable and of smaller size).
    ///
    /// Note that [Ident], [ProcedureName], [LibraryPath] and others are all implemented in terms
    /// of either the actual reference-counted string, e.g. `Arc<str>`, or in terms of [Ident],
    /// which is essentially the former wrapped in a [SourceSpan]. If we ever replace this with
    /// a better interner, we will also want to update those types to be in terms of whatever
    /// the handle type of the interner is.
    interned: BTreeSet<Arc<str>>,
    /// When true, all warning diagnostics are promoted to error severity
    warnings_as_errors: bool,
}
73
74impl ModuleParser {
75    /// Construct a new parser for the given `kind` of [ast::Module].
76    pub fn new(kind: ast::ModuleKind) -> Self {
77        Self {
78            kind,
79            interned: Default::default(),
80            warnings_as_errors: false,
81        }
82    }
83
84    /// Configure this parser so that any warning diagnostics are promoted to errors.
85    pub fn set_warnings_as_errors(&mut self, yes: bool) {
86        self.warnings_as_errors = yes;
87    }
88
89    /// Parse a [ast::Module] from `source`, and give it the provided `path`.
90    pub fn parse(
91        &mut self,
92        path: impl AsRef<Path>,
93        source: Arc<SourceFile>,
94        source_manager: Arc<dyn SourceManager>,
95    ) -> Result<Box<ast::Module>, Report> {
96        let path = path.as_ref();
97        if let Err(err) = Path::validate(path.as_str()) {
98            return Err(Report::msg(err.to_string()).with_source_code(source.clone()));
99        }
100        let forms = parse_forms_internal(source.clone(), &mut self.interned)
101            .map_err(|err| Report::new(err).with_source_code(source.clone()))?;
102        sema::analyze(source, self.kind, path, forms, self.warnings_as_errors, source_manager)
103            .map_err(Report::new)
104    }
105
106    /// Parse a [ast::Module], `name`, from `path`.
107    #[cfg(feature = "std")]
108    pub fn parse_file<N, P>(
109        &mut self,
110        name: N,
111        path: P,
112        source_manager: Arc<dyn SourceManager>,
113    ) -> Result<Box<ast::Module>, Report>
114    where
115        N: AsRef<Path>,
116        P: AsRef<std::path::Path>,
117    {
118        use miden_debug_types::SourceManagerExt;
119        use miden_utils_diagnostics::{IntoDiagnostic, WrapErr};
120
121        let path = path.as_ref();
122        let source_file = source_manager
123            .load_file(path)
124            .into_diagnostic()
125            .wrap_err_with(|| format!("failed to load source file from '{}'", path.display()))?;
126        self.parse(name, source_file, source_manager)
127    }
128
129    /// Parse a [ast::Module], `name`, from `source`.
130    pub fn parse_str(
131        &mut self,
132        name: impl AsRef<Path>,
133        source: impl ToString,
134        source_manager: Arc<dyn SourceManager>,
135    ) -> Result<Box<ast::Module>, Report> {
136        use miden_debug_types::SourceContent;
137
138        let name = name.as_ref();
139        let uri = Uri::from(name.as_str().to_string().into_boxed_str());
140        let content = SourceContent::new(
141            SourceLanguage::Masm,
142            uri.clone(),
143            source.to_string().into_boxed_str(),
144        );
145        let source_file = source_manager.load_from_raw_parts(uri, content);
146        self.parse(name, source_file, source_manager)
147    }
148}
149
/// This is used in tests to parse `source` as a set of raw [ast::Form]s rather than as a
/// [ast::Module].
///
/// NOTE: This does _not_ run semantic analysis.
#[cfg(any(test, feature = "testing"))]
pub fn parse_forms(source: Arc<SourceFile>) -> Result<Vec<ast::Form>, ParsingError> {
    // A throwaway intern table: callers of this test helper do not reuse interned strings.
    parse_forms_internal(source, &mut BTreeSet::new())
}
159
160/// Parse `source` as a set of [ast::Form]s
161///
162/// Aside from catching syntax errors, this does little validation of the resulting forms, that is
163/// handled by semantic analysis, which the caller is expected to perform next.
164fn parse_forms_internal(
165    source: Arc<SourceFile>,
166    interned: &mut BTreeSet<Arc<str>>,
167) -> Result<Vec<ast::Form>, ParsingError> {
168    let source_id = source.id();
169    let scanner = Scanner::new(source.as_str());
170    let lexer = Lexer::new(source_id, scanner);
171    let felt_type = Arc::new(ast::types::ArrayType::new(ast::types::Type::Felt, 4));
172    grammar::FormsParser::new()
173        .parse(source_id, interned, &felt_type, core::marker::PhantomData, lexer)
174        .map_err(|err| ParsingError::from_parse_error(source_id, err))
175}
176
177// DIRECTORY PARSER
178// ================================================================================================
179
/// Read the contents (modules) of this library from `dir`, returning any errors that occur
/// while traversing the file system.
///
/// Errors may also be returned if traversal discovers issues with the modules, such as
/// invalid names, etc.
///
/// Returns an iterator over all parsed modules.
#[cfg(feature = "std")]
pub fn read_modules_from_dir(
    dir: impl AsRef<std::path::Path>,
    namespace: impl AsRef<Path>,
    source_manager: Arc<dyn SourceManager>,
    warnings_as_errors: bool,
) -> Result<impl Iterator<Item = Box<ast::Module>>, Report> {
    use std::collections::{BTreeMap, btree_map::Entry};

    use miden_utils_diagnostics::{IntoDiagnostic, WrapErr, report};
    use module_walker::{ModuleEntry, WalkModules};

    let dir = dir.as_ref();
    if !dir.is_dir() {
        return Err(report!("the provided path '{}' is not a valid directory", dir.display()));
    }

    // mod.masm is not allowed in the root directory
    if dir.join(ast::Module::ROOT_FILENAME).exists() {
        return Err(report!("{} is not allowed in the root directory", ast::Module::ROOT_FILENAME));
    }

    let walker = WalkModules::new(namespace.as_ref().to_path_buf(), dir)
        .into_diagnostic()
        .wrap_err_with(|| format!("failed to load modules from '{}'", dir.display()))?;

    // Modules are keyed by their library path; BTreeMap keeps the output deterministic.
    let mut modules = BTreeMap::default();
    for found in walker {
        let ModuleEntry { mut name, source_path } = found?;
        // Presumably a root module file names its parent directory, so the trailing ROOT
        // component is dropped from its path.
        if name.last().unwrap() == ast::Module::ROOT {
            name.pop();
        }

        // Parse the module discovered at this path.
        let mut parser = ModuleParser::new(ast::ModuleKind::Library);
        parser.set_warnings_as_errors(warnings_as_errors);
        let ast = parser.parse_file(&name, &source_path, source_manager.clone())?;

        // Two files must never resolve to the same module path.
        match modules.entry(name) {
            Entry::Vacant(slot) => {
                slot.insert(ast);
            },
            Entry::Occupied(ref occupied) => {
                return Err(report!("duplicate module '{0}'", occupied.key().clone()));
            },
        }
    }

    Ok(modules.into_values())
}
236
#[cfg(feature = "std")]
mod module_walker {
    use std::{
        ffi::OsStr,
        fs::{self, DirEntry, FileType},
        io,
        path::{Path, PathBuf},
    };

    use miden_utils_diagnostics::{IntoDiagnostic, Report, report};

    use crate::{Path as LibraryPath, PathBuf as LibraryPathBuf, ast::Module};

    /// A module discovered during traversal: its namespaced library path, plus the file
    /// system path of the source file it was found at.
    pub struct ModuleEntry {
        /// The namespace-qualified module path (namespace + validated path components).
        pub name: LibraryPathBuf,
        /// The on-disk location of the module's source file.
        pub source_path: PathBuf,
    }

    /// An iterator over all module files found under a root directory.
    ///
    /// Directories are expanded in FIFO order via the internal queue (breadth-first); each
    /// file with the module extension yields a [ModuleEntry] rooted at `namespace`.
    pub struct WalkModules<'a> {
        /// The namespace used as the prefix of every module path produced.
        namespace: LibraryPathBuf,
        /// The root of the traversal; discovered files are made relative to this.
        root: &'a Path,
        /// Work queue of directory entries not yet examined.
        stack: alloc::collections::VecDeque<io::Result<DirEntry>>,
    }

    impl<'a> WalkModules<'a> {
        /// Start a walk over `path`, naming discovered modules under `namespace`.
        pub fn new(namespace: LibraryPathBuf, path: &'a Path) -> io::Result<Self> {
            use alloc::collections::VecDeque;

            // Seed the queue with the immediate children of the root directory.
            let stack = VecDeque::from_iter(fs::read_dir(path)?);

            Ok(Self { namespace, root: path, stack })
        }

        /// Examine a single directory entry, returning:
        ///
        /// * `Ok(None)` when the entry produces no module itself (a directory, whose children
        ///   are queued for later, or a file without the module extension),
        /// * `Ok(Some(_))` for a module file with a valid path,
        /// * `Err(_)` when the entry is invalid (file/directory name conflict, non-UTF-8 or
        ///   otherwise invalid path components).
        fn next_entry(
            &mut self,
            entry: &DirEntry,
            ty: &FileType,
        ) -> Result<Option<ModuleEntry>, Report> {
            if ty.is_dir() {
                // Defer the directory's contents by appending them to the work queue.
                let dir = entry.path();
                self.stack.extend(fs::read_dir(dir).into_diagnostic()?);
                return Ok(None);
            }

            // Only files carrying the module file extension are treated as modules.
            let mut file_path = entry.path();
            let is_module = file_path
                .extension()
                .map(|ext| ext == AsRef::<OsStr>::as_ref(Module::FILE_EXTENSION))
                .unwrap_or(false);
            if !is_module {
                return Ok(None);
            }

            // Remove the file extension and the root prefix, leaving a namespace-relative path
            file_path.set_extension("");
            if file_path.is_dir() {
                return Err(report!(
                    "file and directory with same name are not allowed: {}",
                    file_path.display()
                ));
            }
            // NOTE(review): entries always originate from `read_dir` under `self.root`, so the
            // prefix strip is expected to be infallible here — confirm for symlinked layouts.
            let relative_path = file_path
                .strip_prefix(self.root)
                .expect("expected path to be a child of the root directory");

            // Construct a [LibraryPath] from the path components, after validating them
            let mut libpath = self.namespace.clone();
            for component in relative_path.iter() {
                // Components must be valid UTF-8 to become library path segments.
                let component = component.to_str().ok_or_else(|| {
                    let p = entry.path();
                    report!("{} is an invalid directory entry", p.display())
                })?;
                LibraryPath::validate(component).into_diagnostic()?;
                libpath.push(component);
            }
            Ok(Some(ModuleEntry { name: libpath, source_path: entry.path() }))
        }
    }

    impl Iterator for WalkModules<'_> {
        type Item = Result<ModuleEntry, Report>;

        fn next(&mut self) -> Option<Self::Item> {
            loop {
                // Pop the next queued entry and resolve its file type; an I/O failure at
                // either step becomes the iterator item.
                let entry = self
                    .stack
                    .pop_front()?
                    .and_then(|entry| entry.file_type().map(|ft| (entry, ft)))
                    .into_diagnostic();

                match entry {
                    Ok((ref entry, ref file_type)) => {
                        // `Ok(None)` from `next_entry` means "nothing to yield"; keep looping.
                        match self.next_entry(entry, file_type).transpose() {
                            None => {},
                            result => break result,
                        }
                    },
                    Err(err) => break Some(Err(err)),
                }
            }
        }
    }
}
340
341// TESTS
342// ================================================================================================
343
344#[cfg(test)]
345mod tests {
346    use miden_core::assert_matches;
347    use miden_debug_types::SourceId;
348
349    use super::*;
350
    // This test checks the lexer behavior with regard to tokenizing `exp(.u?[\d]+)?`
    #[test]
    fn lex_exp() {
        let source_id = SourceId::default();
        let scanner = Scanner::new("begin exp.u9 end");
        // Keep only the token from each (start, token, end) triple.
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));
        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        // `exp.u9` lexes as a dedicated ExpU token followed by the integer exponent.
        assert_matches!(lexer.next(), Some(Ok(Token::ExpU)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(n))) if n == 9);
        assert_matches!(lexer.next(), Some(Ok(Token::End)));
    }
362
    // Checks tokenization of a const declaration and `u32assertw` with and without
    // an `.err=` suffix.
    #[test]
    fn lex_block() {
        let source_id = SourceId::default();
        let scanner = Scanner::new(
            "\
const ERR1 = 1

begin
    u32assertw
    u32assertw.err=ERR1
    u32assertw.err=2
end
",
        );
        // Keep only the token from each (start, token, end) triple.
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));
        // `const ERR1 = 1`
        assert_matches!(lexer.next(), Some(Ok(Token::Const)));
        assert_matches!(lexer.next(), Some(Ok(Token::ConstantIdent("ERR1"))));
        assert_matches!(lexer.next(), Some(Ok(Token::Equal)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(1))));
        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        // A bare `u32assertw` is a single token...
        assert_matches!(lexer.next(), Some(Ok(Token::U32Assertw)));
        // ...while `u32assertw.err=ERR1` lexes as instruction, dot, `err`, `=`, and the value.
        assert_matches!(lexer.next(), Some(Ok(Token::U32Assertw)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Err)));
        assert_matches!(lexer.next(), Some(Ok(Token::Equal)));
        assert_matches!(lexer.next(), Some(Ok(Token::ConstantIdent("ERR1"))));
        assert_matches!(lexer.next(), Some(Ok(Token::U32Assertw)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Err)));
        assert_matches!(lexer.next(), Some(Ok(Token::Equal)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(2))));
        assert_matches!(lexer.next(), Some(Ok(Token::End)));
        // The lexer yields an explicit Eof token at end of input.
        assert_matches!(lexer.next(), Some(Ok(Token::Eof)));
    }
397
    // Checks tokenization of `emit.event("abc")` into its component tokens.
    #[test]
    fn lex_emit() {
        let source_id = SourceId::default();
        let scanner = Scanner::new(
            "\
begin
    push.1
    emit.event(\"abc\")
end
",
        );
        // Keep only the token from each (start, token, end) triple.
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));
        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        assert_matches!(lexer.next(), Some(Ok(Token::Push)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(1))));
        // `emit.event("abc")` lexes as instruction, dot, keyword, and a parenthesized
        // quoted identifier.
        assert_matches!(lexer.next(), Some(Ok(Token::Emit)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Event)));
        assert_matches!(lexer.next(), Some(Ok(Token::Lparen)));
        assert_matches!(lexer.next(), Some(Ok(Token::QuotedIdent("abc"))));
        assert_matches!(lexer.next(), Some(Ok(Token::Rparen)));
        assert_matches!(lexer.next(), Some(Ok(Token::End)));
        assert_matches!(lexer.next(), Some(Ok(Token::Eof)));
    }
423
    // An unrecognized character following whitespace must produce an InvalidToken error
    // whose span points at the offending byte, not at the preceding whitespace.
    #[test]
    fn lex_invalid_token_after_whitespace_returns_error() {
        let source_id = SourceId::default();
        // U+0001 is not a valid token start; it follows "begin " at byte offset 6.
        let scanner = Scanner::new("begin \u{0001}\nend\n");
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));

        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        // The error span must cover exactly the offending byte (6..7).
        assert_matches!(
            lexer.next(),
            Some(Err(ParsingError::InvalidToken { span })) if span.into_range() == (6..7)
        );
    }
436
    // A leading underscore does not start a valid token; the error span must cover
    // only the underscore itself.
    #[test]
    fn lex_invalid_underscore_token_span() {
        let source_id = SourceId::default();
        // The `_` sits at byte offset 6, immediately after "begin ".
        let scanner = Scanner::new("begin _-\nend\n");
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));

        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        // The error span must point at the underscore only (6..7).
        assert_matches!(
            lexer.next(),
            Some(Err(ParsingError::InvalidToken { span })) if span.into_range() == (6..7)
        );
    }
449
    // Single-character tokens and one-character identifiers must report correct
    // (start, end) byte offsets.
    #[test]
    fn lex_single_char_token_and_ident_spans() {
        let source_id = SourceId::default();
        let scanner = Scanner::new("@\nA\n");
        // Use the raw lexer output here so the (start, token, end) offsets can be checked.
        let mut lexer = Lexer::new(source_id, scanner);

        assert_matches!(lexer.next(), Some(Ok((0, Token::At, 1))));
        assert_matches!(lexer.next(), Some(Ok((2, Token::ConstantIdent("A"), 3))));
    }
459
460    #[test]
461    fn overlong_path_component_is_rejected_without_panic() {
462        use std::{
463            panic::{AssertUnwindSafe, catch_unwind},
464            sync::Arc,
465        };
466
467        use crate::{
468            debuginfo::DefaultSourceManager,
469            parse::{Parse, ParseOptions},
470        };
471
472        let big_component = "a".repeat(256);
473        let source = format!("begin\n    exec.{big_component}::x::foo\nend\n");
474
475        let source_manager = Arc::new(DefaultSourceManager::default());
476        let parsed = catch_unwind(AssertUnwindSafe(|| {
477            source.parse_with_options(source_manager, ParseOptions::default())
478        }));
479
480        assert!(parsed.is_ok(), "parsing panicked, expected a structured error");
481        let err = parsed.unwrap().expect_err("parsing succeeded, expected an error");
482        crate::assert_diagnostic!(err, "this reference is invalid without a corresponding import");
483    }
484}