miden_assembly_syntax/parser/mod.rs

/// Simple macro used in the grammar definition for constructing spans
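///
/// A sketch of the two invocation forms (identifiers here are placeholders):
///
/// ```text
/// span!(source_id, start, end) // => SourceSpan::new(source_id, start..end)
/// span!(source_id, offset)     // => SourceSpan::at(source_id, offset)
/// ```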
macro_rules! span {
    ($id:expr, $l:expr, $r:expr) => {
        ::miden_debug_types::SourceSpan::new($id, $l..$r)
    };
    ($id:expr, $i:expr) => {
        ::miden_debug_types::SourceSpan::at($id, $i)
    };
}

lalrpop_util::lalrpop_mod!(
    #[allow(clippy::all)]
    grammar,
    "/parser/grammar.rs"
);

mod error;
mod lexer;
mod scanner;
mod token;

use alloc::{boxed::Box, collections::BTreeSet, string::ToString, sync::Arc, vec::Vec};

use miden_debug_types::{SourceFile, SourceLanguage, SourceManager, Uri};
use miden_utils_diagnostics::Report;

pub use self::{
    error::{BinErrorKind, HexErrorKind, LiteralErrorKind, ParsingError},
    lexer::Lexer,
    scanner::Scanner,
    token::{BinEncodedValue, DocumentationType, IntValue, PushValue, Token, WordValue},
};
use crate::{LibraryPath, ast, sema};

// TYPE ALIASES
// ================================================================================================

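/// The concrete error type produced by the LALRPOP-generated parser, using `u32` byte offsets
/// for token locations.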
type ParseError<'a> = lalrpop_util::ParseError<u32, Token<'a>, ParsingError>;

// MODULE PARSER
// ================================================================================================

/// A wrapper around the lower-level parser infrastructure that orchestrates all of the pieces
/// needed to parse an [ast::Module] from source and run semantic analysis on it.
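///
/// # Example
///
/// A minimal sketch of the intended usage (not compiled as a doctest), assuming a
/// `DefaultSourceManager` is available from `miden_debug_types`, and that the library path
/// and source text below are valid placeholders:
///
/// ```ignore
/// use miden_debug_types::DefaultSourceManager;
///
/// let source_manager = DefaultSourceManager::default();
/// let mut parser = ModuleParser::new(ast::ModuleKind::Library);
/// parser.set_warnings_as_errors(true);
/// let module = parser.parse_str(
///     LibraryPath::new("my_ns::my_module")?,
///     "export.foo\n    add\nend",
///     &source_manager,
/// )?;
/// ```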
#[derive(Default)]
pub struct ModuleParser {
    /// The kind of module we're parsing.
    ///
    /// This is used when performing semantic analysis to detect when various invalid constructions
    /// are encountered, such as use of the `syscall` instruction in a kernel module.
    kind: ast::ModuleKind,
    /// A set of interned strings allocated during parsing/semantic analysis.
    ///
    /// This is a very primitive and imprecise way of interning strings, but it was the least
    /// invasive at the time the new parser was implemented. In essence, we avoid duplicating
    /// allocations for frequently occurring strings by tracking which strings we've seen before,
    /// and sharing a reference-counted pointer instead (see the sketch immediately following this
    /// struct definition).
    ///
    /// We may want to replace this eventually with a proper interner, so that we can also gain the
    /// benefits commonly provided by interned string handles (e.g. cheap equality comparisons, no
    /// ref-counting overhead, copyable and of smaller size).
    ///
    /// Note that [Ident], [ProcedureName], [LibraryPath] and others are all implemented in terms
    /// of either the actual reference-counted string, e.g. `Arc<str>`, or in terms of [Ident],
    /// which is essentially the former wrapped in a [SourceSpan]. If we ever replace this with
    /// a better interner, we will also want to update those types to be in terms of whatever
    /// the handle type of the interner is.
    interned: BTreeSet<Arc<str>>,
    /// When true, all warning diagnostics are promoted to error severity.
    warnings_as_errors: bool,
}
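
// A minimal sketch of the deduplication strategy used for `ModuleParser::interned`; the
// free-standing `intern` helper shown here is hypothetical, as the actual lookups happen
// in the grammar's action code:
//
//     fn intern(interned: &mut BTreeSet<Arc<str>>, s: &str) -> Arc<str> {
//         if let Some(existing) = interned.get(s) {
//             return existing.clone();
//         }
//         let shared: Arc<str> = Arc::from(s);
//         interned.insert(shared.clone());
//         shared
//     }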

impl ModuleParser {
    /// Construct a new parser for the given `kind` of [ast::Module].
    pub fn new(kind: ast::ModuleKind) -> Self {
        Self {
            kind,
            interned: Default::default(),
            warnings_as_errors: false,
        }
    }

    /// Configure this parser so that any warning diagnostics are promoted to errors.
    pub fn set_warnings_as_errors(&mut self, yes: bool) {
        self.warnings_as_errors = yes;
    }

    /// Parse an [ast::Module] from `source`, assigning it the provided `path`.
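    ///
    /// # Example
    ///
    /// A sketch (not compiled as a doctest) that mirrors what [Self::parse_str] does to obtain
    /// the `Arc<SourceFile>`; the module name and source text are placeholders:
    ///
    /// ```ignore
    /// use miden_debug_types::{SourceContent, SourceLanguage, Uri};
    ///
    /// let uri = Uri::from("my_ns::my_module".to_string().into_boxed_str());
    /// let content = SourceContent::new(
    ///     SourceLanguage::Masm,
    ///     uri.clone(),
    ///     "begin add end".to_string().into_boxed_str(),
    /// );
    /// let source = source_manager.load_from_raw_parts(uri, content);
    /// let module = parser.parse(LibraryPath::new("my_ns::my_module")?, source)?;
    /// ```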
    pub fn parse(
        &mut self,
        path: LibraryPath,
        source: Arc<SourceFile>,
    ) -> Result<Box<ast::Module>, Report> {
        let forms = parse_forms_internal(source.clone(), &mut self.interned)
            .map_err(|err| Report::new(err).with_source_code(source.clone()))?;
        sema::analyze(source, self.kind, path, forms, self.warnings_as_errors).map_err(Report::new)
    }

    /// Parse an [ast::Module] named `name` from the file at `path`.
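    ///
    /// # Example
    ///
    /// A sketch (not compiled as a doctest); the library path and file path are placeholders:
    ///
    /// ```ignore
    /// let module = parser.parse_file(
    ///     LibraryPath::new("std::math::u64")?,
    ///     "asm/math/u64.masm",
    ///     &source_manager,
    /// )?;
    /// ```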
    #[cfg(feature = "std")]
    pub fn parse_file<P>(
        &mut self,
        name: LibraryPath,
        path: P,
        source_manager: &dyn SourceManager,
    ) -> Result<Box<ast::Module>, Report>
    where
        P: AsRef<std::path::Path>,
    {
        use miden_debug_types::SourceManagerExt;
        use miden_utils_diagnostics::{IntoDiagnostic, WrapErr};

        let path = path.as_ref();
        let source_file = source_manager
            .load_file(path)
            .into_diagnostic()
            .wrap_err_with(|| format!("failed to load source file from '{}'", path.display()))?;
        self.parse(name, source_file)
    }

    /// Parse an [ast::Module] named `name` from the given `source` string.
    pub fn parse_str(
        &mut self,
        name: LibraryPath,
        source: impl ToString,
        source_manager: &dyn SourceManager,
    ) -> Result<Box<ast::Module>, Report> {
        use miden_debug_types::SourceContent;

        let uri = Uri::from(name.path().into_owned().into_boxed_str());
        let content = SourceContent::new(
            SourceLanguage::Masm,
            uri.clone(),
            source.to_string().into_boxed_str(),
        );
        let source_file = source_manager.load_from_raw_parts(uri, content);
        self.parse(name, source_file)
    }
}

/// This is used in tests to parse `source` as a set of raw [ast::Form]s rather than as an
/// [ast::Module].
///
/// NOTE: This does _not_ run semantic analysis.
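///
/// # Example
///
/// A sketch (not compiled as a doctest), assuming `source_file` is an `Arc<SourceFile>`
/// obtained from a source manager:
///
/// ```ignore
/// // Each top-level item (docs, imports, constants, procedures, the `begin`..`end`
/// // entrypoint) is produced as one raw form, with no validation beyond syntax.
/// let forms = parse_forms(source_file)?;
/// assert!(!forms.is_empty());
/// ```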
#[cfg(any(test, feature = "testing"))]
pub fn parse_forms(source: Arc<SourceFile>) -> Result<Vec<ast::Form>, ParsingError> {
    let mut interned = BTreeSet::default();
    parse_forms_internal(source, &mut interned)
}

/// Parse `source` as a set of [ast::Form]s.
///
/// Aside from catching syntax errors, this does little validation of the resulting forms; that
/// is handled by semantic analysis, which the caller is expected to perform next.
fn parse_forms_internal(
    source: Arc<SourceFile>,
    interned: &mut BTreeSet<Arc<str>>,
) -> Result<Vec<ast::Form>, ParsingError> {
    let source_id = source.id();
    let scanner = Scanner::new(source.as_str());
    let lexer = Lexer::new(source_id, scanner);
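    // A word is an array of 4 field elements; a single shared allocation for its type is
    // handed to the grammar so it need not be rebuilt for every word-typed value.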
    let word_type = Arc::new(ast::types::ArrayType::new(ast::types::Type::Felt, 4));
    grammar::FormsParser::new()
        .parse(source_id, interned, &word_type, core::marker::PhantomData, lexer)
        .map_err(|err| ParsingError::from_parse_error(source_id, err))
}

// DIRECTORY PARSER
// ================================================================================================

/// Read the contents (modules) of this library from `dir`, returning any errors that occur
/// while traversing the file system.
///
/// Errors may also be returned if traversal discovers issues with the modules, such as
/// invalid module names.
///
/// Returns an iterator over all parsed modules.
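///
/// # Example
///
/// A sketch (not compiled as a doctest); the namespace, directory, and `source_manager` are
/// placeholders, and the `LibraryNamespace::new` constructor is assumed:
///
/// ```ignore
/// let modules = read_modules_from_dir(
///     crate::LibraryNamespace::new("std")?,
///     "asm/library",
///     &source_manager,
/// )?
/// .collect::<Vec<_>>();
/// ```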
#[cfg(feature = "std")]
pub fn read_modules_from_dir(
    namespace: crate::LibraryNamespace,
    dir: impl AsRef<std::path::Path>,
    source_manager: &dyn SourceManager,
) -> Result<impl Iterator<Item = Box<ast::Module>>, Report> {
    use std::collections::{BTreeMap, btree_map::Entry};

    use miden_utils_diagnostics::{IntoDiagnostic, WrapErr, report};
    use module_walker::{ModuleEntry, WalkModules};

    let dir = dir.as_ref();
    if !dir.is_dir() {
        return Err(report!("the provided path '{}' is not a valid directory", dir.display()));
    }

    // mod.masm is not allowed in the root directory
    if dir.join(ast::Module::ROOT_FILENAME).exists() {
        return Err(report!("{} is not allowed in the root directory", ast::Module::ROOT_FILENAME));
    }

    let mut modules = BTreeMap::default();

    let walker = WalkModules::new(namespace.clone(), dir)
        .into_diagnostic()
        .wrap_err_with(|| format!("failed to load modules from '{}'", dir.display()))?;
    for entry in walker {
        let ModuleEntry { mut name, source_path } = entry?;
        if name.last() == ast::Module::ROOT {
            name.pop();
        }

        // Parse module at the given path
        let mut parser = ModuleParser::new(ast::ModuleKind::Library);
        let ast = parser.parse_file(name.clone(), &source_path, source_manager)?;
        match modules.entry(name) {
            Entry::Occupied(ref entry) => {
                return Err(report!("duplicate module '{0}'", entry.key().clone()));
            },
            Entry::Vacant(entry) => {
                entry.insert(ast);
            },
        }
    }

    Ok(modules.into_values())
}

#[cfg(feature = "std")]
mod module_walker {
    use std::{
        ffi::OsStr,
        fs::{self, DirEntry, FileType},
        io,
        path::{Path, PathBuf},
    };

    use miden_utils_diagnostics::{IntoDiagnostic, Report, report};

    use crate::{LibraryNamespace, LibraryPath, ast::Module};

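    /// A module discovered during directory traversal: its fully-qualified [LibraryPath] and
    /// the filesystem path of the `.masm` file it was found at.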
    pub struct ModuleEntry {
        pub name: LibraryPath,
        pub source_path: PathBuf,
    }

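    /// An iterator that walks the directory tree under `root` breadth-first, yielding a
    /// [ModuleEntry] for each `.masm` file whose components form a valid [LibraryPath]
    /// under `namespace`.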
    pub struct WalkModules<'a> {
        namespace: LibraryNamespace,
        root: &'a Path,
        stack: alloc::collections::VecDeque<io::Result<DirEntry>>,
    }

    impl<'a> WalkModules<'a> {
        pub fn new(namespace: LibraryNamespace, path: &'a Path) -> io::Result<Self> {
            use alloc::collections::VecDeque;

            let stack = VecDeque::from_iter(fs::read_dir(path)?);

            Ok(Self { namespace, root: path, stack })
        }

        fn next_entry(
            &mut self,
            entry: &DirEntry,
            ty: &FileType,
        ) -> Result<Option<ModuleEntry>, Report> {
            if ty.is_dir() {
                let dir = entry.path();
                self.stack.extend(fs::read_dir(dir).into_diagnostic()?);
                return Ok(None);
            }

            let mut file_path = entry.path();
            let is_module = file_path
                .extension()
                .map(|ext| ext == AsRef::<OsStr>::as_ref(Module::FILE_EXTENSION))
                .unwrap_or(false);
            if !is_module {
                return Ok(None);
            }

            // Remove the file extension and the root prefix, leaving a namespace-relative path
            file_path.set_extension("");
            if file_path.is_dir() {
                return Err(report!(
                    "file and directory with same name are not allowed: {}",
                    file_path.display()
                ));
            }
            let relative_path = file_path
                .strip_prefix(self.root)
                .expect("expected path to be a child of the root directory");

            // Construct a [LibraryPath] from the path components, after validating them
            let mut libpath = LibraryPath::from(self.namespace.clone());
            for component in relative_path.iter() {
                let component = component.to_str().ok_or_else(|| {
                    let p = entry.path();
                    report!("{} is an invalid directory entry", p.display())
                })?;
                libpath.push(component).into_diagnostic()?;
            }
            Ok(Some(ModuleEntry { name: libpath, source_path: entry.path() }))
        }
    }

    impl Iterator for WalkModules<'_> {
        type Item = Result<ModuleEntry, Report>;

        fn next(&mut self) -> Option<Self::Item> {
            loop {
                let entry = self
                    .stack
                    .pop_front()?
                    .and_then(|entry| entry.file_type().map(|ft| (entry, ft)))
                    .into_diagnostic();

                match entry {
                    Ok((ref entry, ref file_type)) => {
                        match self.next_entry(entry, file_type).transpose() {
                            None => continue,
                            result => break result,
                        }
                    },
                    Err(err) => break Some(Err(err)),
                }
            }
        }
    }
}

// TESTS
// ================================================================================================

#[cfg(test)]
mod tests {
    use miden_core::assert_matches;
    use miden_debug_types::SourceId;

    use super::*;

    // This test checks the lexer behavior with regard to tokenizing `exp(.u?[\d]+)?`
    #[test]
    fn lex_exp() {
        let source_id = SourceId::default();
        let scanner = Scanner::new("begin exp.u9 end");
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));
        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        assert_matches!(lexer.next(), Some(Ok(Token::ExpU)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(n))) if n == 9);
        assert_matches!(lexer.next(), Some(Ok(Token::End)));
    }

    #[test]
    fn lex_block() {
        let source_id = SourceId::default();
        let scanner = Scanner::new(
            "\
const.ERR1=1

begin
    u32assertw
    u32assertw.err=ERR1
    u32assertw.err=2
end
",
        );
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));
        assert_matches!(lexer.next(), Some(Ok(Token::Const)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::ConstantIdent("ERR1"))));
        assert_matches!(lexer.next(), Some(Ok(Token::Equal)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(1))));
        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        assert_matches!(lexer.next(), Some(Ok(Token::U32Assertw)));
        assert_matches!(lexer.next(), Some(Ok(Token::U32Assertw)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Err)));
        assert_matches!(lexer.next(), Some(Ok(Token::Equal)));
        assert_matches!(lexer.next(), Some(Ok(Token::ConstantIdent("ERR1"))));
        assert_matches!(lexer.next(), Some(Ok(Token::U32Assertw)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Err)));
        assert_matches!(lexer.next(), Some(Ok(Token::Equal)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(2))));
        assert_matches!(lexer.next(), Some(Ok(Token::End)));
        assert_matches!(lexer.next(), Some(Ok(Token::Eof)));
    }

    #[test]
    fn lex_emit() {
        let source_id = SourceId::default();
        let scanner = Scanner::new(
            "\
begin
    push.1
    emit.event(\"abc\")
end
",
        );
        let mut lexer = Lexer::new(source_id, scanner).map(|result| result.map(|(_, t, _)| t));
        assert_matches!(lexer.next(), Some(Ok(Token::Begin)));
        assert_matches!(lexer.next(), Some(Ok(Token::Push)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Int(1))));
        assert_matches!(lexer.next(), Some(Ok(Token::Emit)));
        assert_matches!(lexer.next(), Some(Ok(Token::Dot)));
        assert_matches!(lexer.next(), Some(Ok(Token::Event)));
        assert_matches!(lexer.next(), Some(Ok(Token::Lparen)));
        assert_matches!(lexer.next(), Some(Ok(Token::QuotedIdent("abc"))));
        assert_matches!(lexer.next(), Some(Ok(Token::Rparen)));
        assert_matches!(lexer.next(), Some(Ok(Token::End)));
        assert_matches!(lexer.next(), Some(Ok(Token::Eof)));
    }
}